Commit 40f09646 authored by Akshatha Ambekar

Minor changes in comments

parent fca4e0bb
%% Cell type:code id: tags:
``` python
from google.colab import drive
drive.mount('/content/drive')
```
%% Output
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
%% Cell type:code id: tags:
``` python
#import all necessary libraries here
import os
import numpy as np
import pandas as pd
import string
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
```
%% Output
Using TensorFlow backend.
%% Cell type:code id: tags:
``` python
os.chdir('/content/drive/My Drive')
input_file_path = '/content/drive/My Drive/amazon_cells_labelled.txt'
```
%% Cell type:code id: tags:
``` python
# split reviews and labels from amazon_cells_labelled.txt
amazonData = pd.read_csv(input_file_path, delimiter='\t', header=None, names=['Review', 'Sentiment'])
amazonData.head()
```
%% Output
Review Sentiment
0 So there is no way for me to plug it in here i... 0
1 Good case, Excellent value. 1
2 Great for the jawbone. 1
3 Tied to charger for conversations lasting more... 0
4 The mic is great. 1
%% Cell type:code id: tags:
``` python
# form the reviews and labels lists
reviews_list = []
labels = []
with open(input_file_path, 'r') as f:
    reviews_ = f.readlines()
```
%% Cell type:code id: tags:
``` python
#required for identifying the nouns, verbs and adjectives in the reviews
import nltk
nltk.download('averaged_perceptron_tagger')
```
%% Output
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /root/nltk_data...
[nltk_data] Package averaged_perceptron_tagger is already up-to-
[nltk_data] date!
True
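%% Cell type:markdown id: tags:
A quick aside (not in the original notebook): `nltk.tag.pos_tag` returns a list of (word, tag) pairs using the Penn Treebank tagset. The filtering below keys on NNP/NNPS (proper nouns), JJ (adjectives) and VB/VBD/VBG/VBN/VBP (verb forms). A minimal sketch on a sample review:
%% Cell type:code id: tags:
``` python
# Minimal sketch (illustration only): inspect the (word, tag) pairs
# the perceptron tagger produces for one sample review.
sample = 'Great for the jawbone'
print(nltk.tag.pos_tag(sample.split()))
# prints a list of (word, tag) tuples, with tags such as 'JJ', 'IN', 'DT', 'NN'
```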
%% Cell type:code id: tags:
``` python
# Prepare data for the pre-processing steps
reviews_list_without_nouns = []
reviews_list_without_adjectives = []
reviews_list_without_verbs = []
table = str.maketrans('', '', string.punctuation)
for i in range(len(reviews_)):
    review, label = reviews_[i].split('\t')
    review = review.replace('.', '')
    stripped = [w.translate(table) for w in review.split(' ')]
    review = ' '.join(stripped)
    tagged_sentence = nltk.tag.pos_tag(review.split())
    # remove proper nouns (NNP/NNPS) from the review
    edited_sentence = [word for word, tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    review_noun_removed = ' '.join(edited_sentence)
    reviews_list_without_nouns.append(review_noun_removed.lower())
    # remove adjectives (JJ) from the review
    tagged_sentence = nltk.tag.pos_tag(review.split())
    edited_sentence = [word for word, tag in tagged_sentence if tag != 'JJ']
    review_adjective_removed = ' '.join(edited_sentence)
    reviews_list_without_adjectives.append(review_adjective_removed.lower())
    # remove verbs (VB/VBD/VBG/VBN/VBP) from the review
    tagged_sentence = nltk.tag.pos_tag(review.split())
    edited_sentence = [word for word, tag in tagged_sentence if tag not in ('VB', 'VBD', 'VBG', 'VBN', 'VBP')]
    review_verb_removed = ' '.join(edited_sentence)
    reviews_list_without_verbs.append(review_verb_removed.lower())
    # original reviews
    reviews_list.append(review.lower())
    labels.append(label)

# Visualize data before applying the data pre-processing techniques
print("Original reviews after removing punctuations: \n")
for i in range(5):
    print(str(labels[i]) + "\t: " + reviews_list[i])
print(" \n Reviews after removing punctuations and nouns: \n")
for i in range(5):
    print(str(labels[i]) + "\t: " + reviews_list_without_nouns[i])
print(" \n Reviews after removing punctuations and adjectives: \n")
for i in range(5):
    print(str(labels[i]) + "\t: " + reviews_list_without_adjectives[i])
print(" \n Reviews after removing punctuations and verbs: \n")
for i in range(5):
    print(str(labels[i]) + "\t: " + reviews_list_without_verbs[i])
```
%% Output
Original reviews after removing punctuations:
0
: so there is no way for me to plug it in here in the us unless i go by a converter
1
: good case excellent value
1
: great for the jawbone
0
: tied to charger for conversations lasting more than 45 minutesmajor problems
1
: the mic is great
Reviews after removing punctuations and nouns:
0
: so there is no way for me to plug it in here in the unless i go by a converter
1
: good case value
1
: for the jawbone
0
: tied to charger for conversations lasting more than 45 minutesmajor
1
: the mic is great
Reviews after removing punctuations and adjectives:
0
: so there is no way for me to plug it in here in the us unless i go by a converter
1
: case excellent value
1
: great for the jawbone
0
: tied to charger for conversations lasting more than 45 minutesmajor problems
1
: the mic is
Reviews after removing punctuations and verbs:
0
: so there is no way for me to it in here in the us unless i by a converter
1
: good case excellent value
1
: great for the jawbone
0
: to for conversations more than 45 minutesmajor problems
1
: the mic is great
%% Cell type:code id: tags:
``` python
reviews = '\n'.join(reviews_list)
reviews_no_nouns = '\n'.join(reviews_list_without_nouns)
reviews_no_adjectives = '\n'.join(reviews_list_without_adjectives)
reviews_no_verbs = '\n'.join(reviews_list_without_verbs)
reviews_process = ' '.join(reviews_list)
print(labels)
```
%% Output
['0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', 
'0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n']
%% Cell type:code id: tags:
``` python
# remove punctuation
def text_without_punct(reviews):
    return reviews.translate(str.maketrans('', '', string.punctuation))

# original reviews without punctuation, as a single string
no_punct_text = text_without_punct(reviews)
reviews_split = reviews.split('\n')
# print("original reviews after split: \n", reviews_split[:100])

# reviews without punctuation and nouns, as a single string
no_punct_text_and_nouns = text_without_punct(reviews_no_nouns)
reviews_split_no_nouns = reviews_no_nouns.split('\n')

# reviews without punctuation and adjectives, as a single string
no_punct_text_and_adjectives = text_without_punct(reviews_no_adjectives)
reviews_split_no_adjectives = reviews_no_adjectives.split('\n')

# reviews without punctuation and verbs, as a single string
no_punct_text_and_verbs = text_without_punct(reviews_no_verbs)
reviews_split_no_verbs = reviews_no_verbs.split('\n')
```
%% Cell type:code id: tags:
``` python
print("original reviews after split: \n", reviews_split[0])
print("reviews with noun removed and split \n ", reviews_split_no_nouns[0])
print("reviews with adjectives removed and split \n ", reviews_split_no_adjectives[0])
print("reviews with verbs removed and split \n ", reviews_split_no_verbs[0])
```
%% Output
original reviews after split:
so there is no way for me to plug it in here in the us unless i go by a converter
reviews with noun removed and split
so there is no way for me to plug it in here in the unless i go by a converter
reviews with adjectives removed and split
so there is no way for me to plug it in here in the us unless i go by a converter
reviews with verbs removed and split
so there is no way for me to it in here in the us unless i by a converter
%% Cell type:code id: tags:
``` python
# Split the formatted no_punct_text into words
def split_in_words(no_punct_text):
    return no_punct_text.split()
words = split_in_words(no_punct_text)
print("words after splitting: ", words[:50])
```
%% Output
words after splitting: ['so', 'there', 'is', 'no', 'way', 'for', 'me', 'to', 'plug', 'it', 'in', 'here', 'in', 'the', 'us', 'unless', 'i', 'go', 'by', 'a', 'converter', 'good', 'case', 'excellent', 'value', 'great', 'for', 'the', 'jawbone', 'tied', 'to', 'charger', 'for', 'conversations', 'lasting', 'more', 'than', '45', 'minutesmajor', 'problems', 'the', 'mic', 'is', 'great', 'i', 'have', 'to', 'jiggle', 'the', 'plug']
%% Cell type:code id: tags:
``` python
# print the total length of the words
print("Total number of words {}".format(len(words)))
# Total number of unique words
print("Total number of unique words {}".format(len(set(words))))
```
%% Output
Total number of words 10196
Total number of unique words 1905
%% Cell type:code id: tags:
``` python
# Stop word removal
from spacy.lang.en.stop_words import STOP_WORDS
words = [word for word in words if word not in STOP_WORDS]
```
%% Cell type:code id: tags:
``` python
# Clean up original reviews
clean_reviews = []
for review in reviews_list:
    review_words = review.split(' ')
    review_words = [word for word in review_words if word not in string.punctuation and word not in STOP_WORDS]
    clean_reviews.append(' '.join(review_words))
# original reviews
print("original reviews \n")
print(reviews_list[0][:100])
print(clean_reviews[0][:100])

# Clean up reviews without nouns
clean_reviews_no_nouns = []
for review in reviews_list_without_nouns:
    review_words = review.split(' ')
    review_words = [word for word in review_words if word not in string.punctuation and word not in STOP_WORDS]
    clean_reviews_no_nouns.append(' '.join(review_words))
# reviews after nouns removed
print("\nreviews after nouns removed \n")
print(reviews_list_without_nouns[0][:100])
print(clean_reviews_no_nouns[0][:100])

# Clean up reviews without adjectives
clean_reviews_no_adjectives = []
for review in reviews_list_without_adjectives:
    review_words = review.split(' ')
    review_words = [word for word in review_words if word not in string.punctuation and word not in STOP_WORDS]
    clean_reviews_no_adjectives.append(' '.join(review_words))
# reviews after adjectives removed
print("\nreviews after adjectives removed \n")
print(reviews_list_without_adjectives[0][:100])
print(clean_reviews_no_adjectives[0][:100])

# Clean up reviews without verbs
clean_reviews_no_verbs = []
for review in reviews_list_without_verbs:
    review_words = review.split(' ')
    review_words = [word for word in review_words if word not in string.punctuation and word not in STOP_WORDS]
    clean_reviews_no_verbs.append(' '.join(review_words))
# reviews after verbs removed
print("\nreviews after verbs removed \n")
print(reviews_list_without_verbs[0][:100])
print(clean_reviews_no_verbs[0][:100])
```
%% Output
original reviews
so there is no way for me to plug it in here in the us unless i go by a converter
way plug converter
reviews after nouns removed
so there is no way for me to plug it in here in the unless i go by a converter
way plug converter
reviews after adjectives removed
so there is no way for me to plug it in here in the us unless i go by a converter
way plug converter
reviews after verbs removed
so there is no way for me to it in here in the us unless i by a converter
way converter
%% Cell type:code id: tags:
``` python
print("Total number of unique words after stop words removed : {}".format(len(set(words))))
```
%% Output
Total number of unique words after stop words removed : 1695
%% Cell type:code id: tags:
``` python
## Count all the words and maintain a dictionary
def word_count(words):
    return Counter(words)
counts = word_count(words)
```
%% Cell type:code id: tags:
``` python
# Check the count of a sample word
print(counts['converter'])
```
%% Output
1
%% Cell type:code id: tags:
``` python
# define a vocabulary for the words after clean-up
def vocabulary(counts):
    return list(counts.keys())
vocab = vocabulary(counts)
vocab[1]
```
%% Output
'plug'
%% Cell type:code id: tags:
``` python
# map each vocab word to an integer, starting at 1 (0 is reserved for padding)
def vocabulary_to_integer(vocab):
    return {word: number for number, word in enumerate(vocab, 1)}
vocab_to_int = vocabulary_to_integer(vocab)
print(vocab_to_int)
```
%% Output
{'way': 1, 'plug': 2, 'converter': 3, 'good': 4, 'case': 5, 'excellent': 6, 'value': 7, 'great': 8, 'jawbone': 9, 'tied': 10, 'charger': 11, 'conversations': 12, 'lasting': 13, '45': 14, 'minutesmajor': 15, 'problems': 16, 'mic': 17, 'jiggle': 18, 'line': 19, 'right': 20, 'decent': 21, 'volume': 22, 'dozen': 23, 'contacts': 24, 'imagine': 25, 'fun': 26, 'sending': 27, 'razr': 28, 'owneryou': 29, 'needless': 30, 'wasted': 31, 'money': 32, 'waste': 33, 'time': 34, 'sound': 35, 'quality': 36, 'impressed': 37, 'going': 38, 'original': 39, 'battery': 40, 'extended': 41, 'seperated': 42, 'mere': 43, '5': 44, 'ft': 45, 'started': 46, 'notice': 47, 'excessive': 48, 'static': 49, 'garbled': 50, 'headset': 51, 'design': 52, 'odd': 53, 'ear': 54, 'clip': 55, 'comfortable': 56, 'highly': 57, 'recommend': 58, 'blue': 59, 'tooth': 60, 'phone': 61, 'advise': 62, 'fooled': 63, 'far': 64, 'works': 65, 'clicks': 66, 'place': 67, 'makes': 68, 'wonder': 69, 'long': 70, 'mechanism': 71, 'went': 72, 'motorolas': 73, 'website': 74, 'followed': 75, 'directions': 76, 'pair': 77, 'bought': 78, 'use': 79, 'kindle': 80, 'fire': 81, 'absolutely': 82, 'loved': 83, 'commercials': 84, 'misleading': 85, 'run': 86, 'new': 87, 'bars': 88, 'thats': 89, 'days': 90, 'charging': 91, 'mother': 92, 'problem': 93, 'pocket': 94, 'pc': 95, 'combination': 96, 'ive': 97, 'owned': 98, '7': 99, 'months': 100, 'best': 101, 'mobile': 102, 'didnt': 103, 'think': 104, 'instructions': 105, 'provided': 106, 'helpful': 107, 'people': 108, 'couldnt': 109, 'hear': 110, 'talk': 111, 'pull': 112, 'earphone': 113, 'doesnt': 114, 'hold': 115, 'charge': 116, 'simple': 117, 'little': 118, 'breakage': 119, 'unacceptible': 120, 'product': 121, 'ideal': 122, 'like': 123, 'ears': 124, 'sensitive': 125, 'unusable': 126, 'moving': 127, 'car': 128, 'freeway': 129, 'speed': 130, 'years': 131, 'left': 132, 'contract': 133, 'hate': 134, 'ac': 135, 'included': 136, 'sure': 137, 'juicehighy': 138, 'recommended': 139, 'need': 140, '3': 141, 'mins': 142, 'book': 143, 'turn': 144, 'phonebattery': 145, 'life': 146, 'short': 147, 'kept': 148, 'poor': 149, 'performance': 150, 'fine': 151, '680': 152, 'worthless': 153, 'camera': 154, '2mp': 155, 'pics': 156, 'nice': 157, 'clear': 158, 'picture': 159, 'priced': 160, 'garbage': 161, 'audio': 162, 'bluetooth': 163, 'features': 164, 'want': 165, 'mind': 166, 'gonna': 167, 'buy': 168, 'arguing': 169, 'verizon': 170, 'dropped': 171, 'calls': 172, 'returned': 173, 'phones': 174, 'disappointed': 175, 'loud': 176, 'protection': 177, 'bulky': 178, 'usable': 179, 'keyboard': 180, 'actually': 181, 'turns': 182, 'pda': 183, 'realworld': 184, 'useful': 185, 'machine': 186, 'instead': 187, 'neat': 188, 'gadget': 189, 'pretty': 190, 'sturdy': 191, 'large': 192, 'love': 193, 'thing': 194, 'reasonable': 195, 'price': 196, 'ie': 197, 'stream': 198, 'submerged': 199, '15': 200, 'seconds': 201, 'happy': 202, '510': 203, 'complaints': 204, 'end': 205, 'buttons': 206, 'bad': 207, 'essentially': 208, 'forget': 209, 'microsofts': 210, 'tech': 211, 'support': 212, 'faceplates': 213, 'looks': 214, 'elegant': 215, 'cool': 216, 'headphones': 217, 'find': 218, 'purchase': 219, 'seriously': 220, 'different': 221, 'particular': 222, 'angle': 223, 'party': 224, 'clearly': 225, 'big': 226, 'drawback': 227, 'mp3': 228, 'player': 229, 'cover': 230, 'let': 231, 'pause': 232, 'skip': 233, 'songs': 234, 'lock': 235, 'week': 236, 'later': 237, 'activated': 238, 'suddenly': 239, 'died': 240, 'feels': 241, 'headsets': 242, 'wear': 243, 'glasses': 244, 'gets': 
245, 'ipods': 246, 'device': 247, 'situations1': 248, 'work': 249, 'bmw': 250, 'series': 251, 'fairly': 252, 'quiet': 253, 'trouble': 254, 'hearing': 255, 'person': 256, 'saying': 257, 'choice': 258, 'docking': 259, 'station': 260, 'home': 261, 'beautiful': 262, 'd807wrongly': 263, 'advertised': 264, 'd807': 265, 'item': 266, 'handy': 267, 'lot': 268, 'purchased': 269, '2': 270, 'longer': 271, 'working': 272, 'everyday': 273, 'holds': 274, 'bargain': 275, 'packaged': 276, 'arrived': 277, 'intended': 278, 'runs': 279, 'quickly': 280, 'worked': 281, 'broke': 282, '6': 283, 'easy': 284, 'loves': 285, 'construction': 286, 'better': 287, 'boy': 288, 'cheaper': 289, 'loads': 290, 'super': 291, 'costs': 292, 'expect': 293, 'greater': 294, 'ease': 295, 'buds': 296, 'play': 297, 'music': 298, 'dont': 299, 'order': 300, 'plan': 301, 'found': 302, 'waaay': 303, 'tried': 304, 'bluetooths': 305, 'listener': 306, 'im': 307, 'decision': 308, 'integrated': 309, 'seamlessly': 310, 'motorola': 311, 'buyer': 312, 'beware': 313, 'flush': 314, 'toilet': 315, 'definitely': 316, 'free': 317, 'shipping': 318, 'received': 319, 'supposedly': 320, '375': 321, 'apparently': 322, 'match': 323, 'prosgood': 324, 'pictures': 325, 'styles': 326, 'black': 327, 'white': 328, 'huge': 329, 'flaw': 330, 'correctly': 331, '350': 332, 'jabra350': 333, 'reception': 334, 'piece': 335, 'fit': 336, 'rated': 337, 'impressive': 338, '13': 339, 'megapixels': 340, 'renders': 341, 'images': 342, 'fall': 343, 'expectations': 344, 'relatively': 345, 'high': 346, 'resolution': 347, 'purcashed': 348, 'wife': 349, 'ask': 350, 'slim': 351, 'light': 352, 'display': 353, 'geeky': 354, 'sex': 355, 'toast': 356, 'rocks': 357, 'oozes': 358, 'embedded': 359, 'sleek': 360, 'stylish': 361, 'leather': 362, 'fast': 363, 'compromise': 364, 'qwerty': 365, 'basic': 366, 'cell': 367, 'number': 368, 'keypad': 369, 'got': 370, 'completely': 371, 'unhappy': 372, 'winner': 373, 'setup': 374, 'simpler': 375, 'earpieces': 376, 'jabra': 377, 'fits': 378, 'comfortably': 379, 'strong': 380, 'signal': 381, 'iam': 382, 'pleased': 383, 'job': 384, 'basically': 385, 'service': 386, 'set': 387, 'weeks': 388, 'bt': 389, 'disapoinment': 390, 'small': 391, 'realize': 392, 'getting': 393, 'accompanied': 394, 'software': 395, 'brilliant': 396, 'nicely': 397, 'avoid': 398, 'damage': 399, 'definitly': 400, 'buyerbe': 401, 'careful': 402, 'majority': 403, 'logitech': 404, 'earbud': 405, 'failed': 406, 'stuff': 407, 'peachykeen': 408, 'house': 409, 'coverage': 410, 'upstairs': 411, 'basement': 412, 'voice': 413, 'recognition': 414, 'tremendous': 415, 'minute': 416, 'experienced': 417, 'drops': 418, 'area': 419, 'takes': 420, 'forever': 421, 'hours': 422, 'literally': 423, 'reccomendation': 424, 'relative': 425, 'glad': 426, 'items': 427, 'stated': 428, 'description': 429, 'screen': 430, 'sudden': 431, 'hoping': 432, 'linking': 433, '8530': 434, 'blackberry': 435, 'curve': 436, 'know': 437, 'sounds': 438, 'funny': 439, 'sketchy': 440, 'technology': 441, 'wouldnt': 442, 'wellwell': 443, 'wired': 444, 'kind': 445, 'messages': 446, 'web': 447, 'browsing': 448, 'significantly': 449, 'faster': 450, 'previous': 451, 'build': 452, 'unlike': 453, 'cheap': 454, 's': 455, 'fantastic': 456, 'perfectly': 457, 'colors': 458, 'w810i': 459, 'superb': 460, 'whine': 461, 'internet': 462, 'goesthe': 463, 'communications': 464, 'tool': 465, 'communicate': 466, 'charm': 467, 'maintain': 468, 'monkeys': 469, 'shouldnt': 470, 'obviously': 471, 'share': 472, 'dna': 473, 'copy': 474, 'humans': 475, 
'bougth': 476, 'l7c': 477, 'look': 478, 'sharp': 479, 'graphics': 480, 'mode': 481, 'button': 482, 'thank': 483, 'wasting': 484, 'bethe': 485, 'igo': 486, 'chargers': 487, 'tips': 488, 'file': 489, 'browser': 490, 'offers': 491, 'options': 492, 'needshandsfree': 493, 'network': 494, 'connected': 495, 'wifes': 496, 'bluetoothmotorola': 497, 'hs850': 498, 'latest': 499, 'os': 500, 'v115g': 501, 'likes': 502, 'slow': 503, 'crawl': 504, 'recognizes': 505, 'storage': 506, 'buzzing': 507, 'override': 508, 'bluetoooth': 509, 'functionality': 510, 'awesome': 511, 'thorn': 512, 'abhor': 513, 'recently': 514, 'stay': 515, '10': 516, 'minutes': 517, 'disconnected': 518, 'incredible': 519, 'bucks': 520, 'check': 521, 'mail': 522, 'night': 523, 'backlight': 524, 'message': 525, 'lost': 526, 'replacement': 527, 'ring': 528, 'toneoverall': 529, 'lately': 530, 'extremely': 531, 'wit': 532, 'hit': 533, 'dropping': 534, 'weight': 535, 'hardly': 536, 'youll': 537, 'thin': 538, 'pleather': 539, 'useless': 540, 'simply': 541, 'deaf': 542, 'color': 543, 'prettier': 544, 'thought': 545, 'incredibly': 546, 'investment': 547, 'strange': 548, 'ticking': 549, 'noises': 550, 'ends': 551, 'electronics': 552, 'available': 553, 'fm': 554, 'transmitters': 555, 'lasts': 556, 'h500': 557, '12': 558, 'mega': 559, 'pixel': 560, 'reasonably': 561, 'good7': 562, 'nearly': 563, 'transmit': 564, 'bother': 565, 'contacting': 566, 'company': 567, 'dollar': 568, 'learned': 569, 'lesson': 570, 'form': 571, 'online': 572, 'earbugs': 573, 'means': 574, 'range': 575, 'able': 576, 'roam': 577, 'living': 578, 'room': 579, 'receptionsound': 580, 'issues': 581, 'felt': 582, 'crack': 583, 'worst': 584, 'infatuated': 585, 'freezes': 586, 'frequently4': 587, 'embarrassing': 588, 'childlike': 589, 'lightweight': 590, 'id': 591, 'expected': 592, 'consumer': 593, 'experience': 594, 'theres': 595, 'horrible': 596, 'tick': 597, 'background': 598, 'certainly': 599, 'usually': 600, 'headbands': 601, 'mess': 602, 'hair': 603, 'bit': 604, 'year': 605, 'tell': 606, 'ordered': 607, 'sony': 608, 'ericsson': 609, 'favorite': 610, 'purchases': 611, 'market': 612, 'authentic': 613, 'shine': 614, 'comfort': 615, 'excited': 616, 'cute': 617, 'mistake': 618, 'disappointment': 619, 'calendar': 620, 'sync': 621, 'customer': 622, 'additional': 623, 'gels': 624, 'whatsoever': 625, 'defeats': 626, 'purpose': 627, 'worth': 628, 'penny': 629, 'wallet': 630, 'type': 631, 'excrutiatingly': 632, 'probably': 633, 'important': 634, 'aspect': 635, 'glove': 636, 'secure': 637, 'durable': 638, 'o': 639, 'gosh': 640, 'attractive': 641, 'appears': 642, 'factor': 643, 'rubberpetroleum': 644, 'smell': 645, 'unbearable': 646, 'caused': 647, 'return': 648, 'cable': 649, 'flimsy': 650, 'scary': 651, 'earpiece': 652, 'hands': 653, 'stereo': 654, 'month': 655, 'flawlessly': 656, 'absolutel': 657, 'junk': 658, 'real': 659, '8': 660, 'drain': 661, 'potentially': 662, 'fry': 663, 'unreliable': 664, 'giving': 665, 'gave': 666, 'stars': 667, 'reversible': 668, 'rotating': 669, 'feature': 670, 'family': 671, 'seller': 672, 'plantronics': 673, 'adorable': 674, 'buying': 675, 'poorly': 676, 'contstruct': 677, 'hinge': 678, 'installed': 679, 'charged': 680, 'overnite': 681, 'handset': 682, 'cat': 683, 'attacked': 684, 'scratched': 685, 'protective': 686, 'strip': 687, 'destroying': 688, 'terrible': 689, 'razor': 690, 'v3i': 691, 'wise': 692, 'shouldve': 693, 'invented': 694, 'sooner': 695, 'trythe': 696, 'engineered': 697, 'clever': 698, 'complained': 699, 'microphone': 700, 'weak': 701, 
'5year': 702, 'old': 703, 'nokia': 704, '2160': 705, 'tracfone': 706, 'care': 707, 'instruction': 708, 'manual': 709, 'lacking': 710, 'alarm': 711, 'clock': 712, 'removing': 713, 'antena': 714, 'uncomfortable': 715, 'compared': 716, 'plugged': 717, 'lg': 718, 'gotten': 719, 'compliments': 720, 'state': 721, 'allow': 722, 'usage': 723, 'driving': 724, 'immediately': 725, 'ngage': 726, 'earbuds': 727, 'dialing': 728, '23': 729, 'cant': 730, 'low': 731, 'howeverthe': 732, 'riingtones': 733, 'games': 734, 'amazon': 735, 'sucks': 736, 'rip': 737, 'came': 738, 'ago': 739, 'couple': 740, 'ipod': 741, '1': 742, 'recharge': 743, 'frequentyly': 744, 'flip': 745, 'phones2': 746, 'adhesive': 747, 'disappointing': 748, 'inexpensive': 749, 'practically': 750, 'add': 751, 'boost': 752, 'times': 753, 'concrete': 754, 'knock': 755, 'wood': 756, 'transformed': 757, 'organizational': 758, 'capability': 759, 'easier': 760, 'sitting': 761, 'vehicle': 762, 'cradle': 763, 'belt': 764, 'jerks': 765, 'los': 766, 'angeles': 767, 'starter': 768, 'wireless': 769, 'loudspeaker': 770, 'option': 771, 'bumpers': 772, 'lights': 773, 'appealing': 774, 'improve': 775, 'leaks': 776, 'hot': 777, 'according': 778, 'called': 779, 'applifies': 780, 'save': 781, 'specially': 782, 'face': 783, 'transmission': 784, 's11': 785, 'data': 786, 'finished': 787, 'looking': 788, 'happier': 789, 'ill': 790, 'drivng': 791, 'starts': 792, 'ringing': 793, 'reason': 794, 'having': 795, 'auto': 796, 'reverse': 797, 'tape': 798, 'embarassing': 799, 'hurt': 800, 'try': 801, 'push': 802, 'protects': 803, 'sides': 804, 'average': 805, 'operates': 806, 'skype': 807, 'soyo': 808, 'self': 809, 'portraits': 810, 'outside': 811, 'exterior': 812, 'mentioned': 813, 'trying': 814, 'handsfree': 815, 'gadgets': 816, 'finally': 817, 'magical': 818, 'help': 819, 'crap': 820, 'shipped': 821, 'promptly': 822, 'exactly': 823, 'wanted': 824, 'comparablypriced': 825, 'offering': 826, 'today': 827, 'deal': 828, 'satisfied': 829, 'encourage': 830, 'youd': 831, 'effective': 832, 'waiting': 833, 'recieve': 834, 'prompt': 835, 'especially': 836, 'stupid': 837, 'cradles': 838, 'kits': 839, 'comes': 840, 'excelent': 841, 'cingulair': 842, 'nicer': 843, 'noticed': 844, 'era': 845, 'colored': 846, 'goes': 847, 'dead': 848, 'hoursthe': 849, 'thereplacement': 850, '2000': 851, 'cheaply': 852, 'att': 853, 'distorted': 854, 'yell': 855, 'plastic': 856, 'breaks': 857, 'oh': 858, 'forgot': 859, 'mention': 860, 'weird': 861, 'effect': 862, 'iriver': 863, 'spinn': 864, 'unit': 865, 'fond': 866, 'magnetic': 867, 'strap': 868, 'overall': 869, 'psyched': 870, 'appointments': 871, 'note': 872, 'appearance': 873, 'bland': 874, 'model': 875, 'sanyo': 876, 'survived': 877, 'dozens': 878, 'blacktop': 879, 'earphones': 880, 'finds': 881, 'away': 882, 'enter': 883, 'modest': 884, 'cellular': 885, 'clarity': 886, 'warning': 887, 'wish': 888, 'awsome': 889, 'drained': 890, 'dying': 891, 'earpad': 892, 'onlyi': 893, 'displeased': 894, 'defect': 895, 'risk': 896, 'built': 897, 'difficult': 898, 'install': 899, 'restored': 900, 'purchasing': 901, 'jx10': 902, 'moto': 903, 'q': 904, 'figure': 905, 'searched': 906, 'size': 907, 'key': 908, 'pad': 909, 'lit': 910, 'hard': 911, 'wasnt': 912, 'portable': 913, 'colleague': 914, 'receptiona': 915, 'expensive': 916, 'fully': 917, 'bed': 918, 'turned': 919, 'wifi': 920, '20': 921, 'morning': 922, 'reading': 923, 'memory': 924, 'card': 925, 'wearing': 926, 'hat': 927, 'sunglasses': 928, 'timely': 929, 'shipment': 930, 'solid': 931, 'surefire': 932, 'gx2': 
933, 'bt50': 934, 'computer': 935, 'buyers': 936, 'remorse': 937, 'accessoryone': 938, 'inexcusable': 939, 'returning': 940, 'changing': 941, 'carriers': 942, 'tmobile': 943, 'update': 944, 'procedure': 945, 'cumbersome': 946, 'delivery': 947, 'vx9900': 948, 'env': 949, 'switch': 950, 'rocketed': 951, 'destination': 952, 'unknown': 953, 'longwearing': 954, 'conditions': 955, 'worthwhile': 956, 'usefulness': 957, 'verizons': 958, 'bills': 959, 'understand': 960, 'pricing': 961, 'plans': 962, 'overnight': 963, 'batteries': 964, 'wont': 965, 'regret': 966, 'user': 967, 'friendly': 968, 'unfortunately': 969, 'ability': 970, 'receiving': 971, 'pitiful': 972, 'respect': 973, 'exchanged': 974, 'results': 975, 'stuck': 976, 'max': 977, 'mute': 978, 'hybrid': 979, 'palmtopcameracellphone': 980, 'excels': 981, 'roles': 982, 'bt250v': 983, 'liked': 984, 'wrong': 985, 'described': 986, '11': 987, 'bose': 988, 'noise': 989, 'cancelling': 990, 'amazing': 991, 'nyc': 992, 'commuter': 993, 'defective': 994, 'given': 995, 'star': 996, 'unacceptableunless': 997, 'holster': 998, 'photo': 999, 'ad': 1000, 'greatno': 1001, 'earlier': 1002, 'review': 1003, 'noted': 1004, 'happens': 1005, 'frog': 1006, 'eye': 1007, 'catching': 1008, 'pushed': 1009, 'function': 1010, 'amazed': 1011, 'aluminum': 1012, 'palm': 1013, 'vx': 1014, 'wellit': 1015, 'protected': 1016, 'handheld': 1017, 'tools': 1018, 'sturdiness': 1019, 'orders': 1020, 'timeframe': 1021, 'source': 1022, 'waterproof': 1023, 'complaint': 1024, 'standard': 1025, '5of': 1026, 'thanks': 1027, 'things': 1028, 'ended': 1029, 'sliding': 1030, 'edge': 1031, 'pants': 1032, 'pockets': 1033, 'store': 1034, 'ugly': 1035, 'shield': 1036, 'incrediable': 1037, 'improvement': 1038, 'refuse': 1039, 'refund': 1040, 'replace': 1041, 'accidentally': 1042, 'activate': 1043, 'gentletouch': 1044, 'touch': 1045, 'listening': 1046, 'threw': 1047, 'window': 1048, 'took': 1049, 'drop': 1050, 'inches': 1051, 'kitchen': 1052, 'counter': 1053, 'crackedi': 1054, 'laughing': 1055, 'trunk': 1056, 'carried': 1057, 'conversation': 1058, 'hitch': 1059, 'practical': 1060, 'ample': 1061, 'eargels': 1062, 'channel': 1063, 'directly': 1064, 'increase': 1065, 'ones': 1066, 'properly': 1067, 'missed': 1068, 'numerous': 1069, 'sucked': 1070, 'shifting': 1071, 'bubbling': 1072, 'peeling': 1073, 'scratch': 1074, 'nothingi': 1075, 'droid': 1076, 'zero': 1077, 'exercise': 1078, 'frustration': 1079, 'earset': 1080, 'outgoing': 1081, 'total': 1082, 'package': 1083, 'understanding': 1084, 'patient': 1085, 'wirefly': 1086, 'stari': 1087, 'contact': 1088, 'cingularatt': 1089, 'inform': 1090, 'practice': 1091, 'aggravating': 1092, 'friends': 1093, 'enjoy': 1094, 'virgin': 1095, 'muddy': 1096, 'casing': 1097, 'wires': 1098, 'insert': 1099, 'glued': 1100, 'slid': 1101, 'isnt': 1102, 'plantronincs': 1103, 'continues': 1104, 'flawed': 1105, 'disapointing': 1106, 'fourth': 1107, 'hated': 1108, 'fixes': 1109, 'accessing': 1110, 'downloading': 1111, 'ringtones': 1112, 'performing': 1113, 'functions': 1114, 'barely': 1115, 'constantly': 1116, 'unacceptable': 1117, 'joke': 1118, 'said': 1119, 'happening': 1120, 'forced': 1121, 'stop': 1122, 'adapters': 1123, 'walked': 1124, 'procedures': 1125, 'reset': 1126, 'wiping': 1127, 'strength': 1128, 'plays': 1129, 'louder': 1130, 'speaker': 1131, 'constructed': 1132, 'menus': 1133, 'navigate': 1134, 'recessed': 1135, 'holding': 1136, 'onid': 1137, 'avoiding': 1138, 'brokeni': 1139, 'smoking': 1140, 'sprint': 1141, 'linked': 1142, 'effort': 1143, 'possesed': 1144, 'idea': 
1145, 'trash': 1146, 'research': 1147, 'development': 1148, 'division': 1149, 'knows': 1150, 'theyre': 1151, 'killer': 1152, 'course': 1153, 'breaking': 1154, 'infuriating': 1155, 'walkman': 1156, 'charges': 1157, 'feel': 1158, 'europe': 1159, 'asia': 1160, 'clipping': 1161, 'deffinitely': 1162, '50': 1163, 'cents': 1164, 'upandcoming': 1165, 'behing': 1166, '5020': 1167, 'comfortible': 1168, '24': 1169, 'day': 1170, 'pain': 1171, 'quick': 1172, 'arrival': 1173, 'fraction': 1174, 'samsungcrap': 1175, 'crappy': 1176, 'samsung': 1177, 'e715': 1178, 'seeen': 1179, 'stopped': 1180, 'needed': 1181, 'operate': 1182, 'screenthis': 1183, 'interface': 1184, 'decade': 1185, 'compete': 1186, 'designs': 1187, 'paired': 1188, 'treo': 1189, '700w': 1190, 'usb': 1191, 'transceiver': 1192, 'steer': 1193, 'genuine': 1194, 'replacementr': 1195, 'pens': 1196, 'come': 1197, 'threepack': 1198, 'buyit': 1199, 'beats': 1200, 'fingers': 1201, 'plus': 1202, 'believe': 1203, 'steep': 1204, 'point': 1205, 'cases': 1206, 'normally': 1207, 'apart': 1208, 'haul': 1209, 'dissapointing': 1210, 'brand': 1211, 'extra': 1212, 'originally': 1213, 'discarded': 1214, 'phonesmp3': 1215, 'players': 1216, 'posted': 1217, 'detailed': 1218, 'comments': 1219, 'grey': 1220, 'red': 1221, 'pay': 1222, 'guess': 1223, 'existing': 1224, 'cds': 1225, 'connection': 1226, 'surprised': 1227, 'reviews': 1228, 'fabulous': 1229, 'currently': 1230, 'firstperson': 1231, 'shooters': 1232, 'delay': 1233, 'messes': 1234, 'bitpim': 1235, 'program': 1236, 'internetto': 1237, 'transfer': 1238, 'phonethe': 1239, 'accessory': 1240, 'manufacturer': 1241, 'performed': 1242, 'awful': 1243, 'muffled': 1244, 'tinny': 1245, 'incoming': 1246, 'severe': 1247, 'echo': 1248, 'windresistant': 1249, 'overly': 1250, 'replaceeasy': 1251, 'contacted': 1252, 'told': 1253, 'warranty': 1254, 'produce': 1255, 'receipt': 1256, 'luck': 1257, 'linksys': 1258, 'exchange': 1259, 'refurb': 1260, 'bar': 1261, 'placed': 1262, 'snug': 1263, 'heavyit': 1264, 'keeps': 1265, 'falling': 1266, 'utter': 1267, 'promised': 1268, 'loop': 1269, 'tiny': 1270, 'spring': 1271, 'latch': 1272, 'visor': 1273, 'tries': 1274, 'download': 1275, 'address': 1276, 'rebootsoverall': 1277, 'rate': 1278, 'tungsten': 1279, 'e2': 1280, 'flipphones': 1281, 'welldesigned': 1282, 'smoothly': 1283, 'study': 1284, 'interested': 1285, 'sins': 1286, 'industrial': 1287, 'happened': 1288, 'tracking': 1289, 'access': 1290, 'detachable': 1291, 'continue': 1292, 'pairing': 1293, 'periodically': 1294, 'upload': 1295, 'locks': 1296, 'screens': 1297, 'flash': 1298, 'randomly': 1299, 'locked': 1300, 'truly': 1301, '325': 1302, 'cellphone': 1303, 'wornout': 1304, 'ringer': 1305, 'choices': 1306, 'tones': 1307, 'acceptable': 1308, 'balance': 1309, 'ready': 1310, 'prime': 1311, 'coming': 1312, 'upbeat': 1313, 'chinese': 1314, 'forgeries': 1315, 'abound': 1316, 'explain': 1317, 'jack': 1318, 'ca42': 1319, 'crisp': 1320, 'smallest': 1321, 'stays': 1322, 'biggest': 1323, 'drains': 1324, 'superfast': 1325, 'ergonomic': 1326, 'theory': 1327, 'stand': 1328, 'video': 1329, 'clips': 1330, 'occupied': 1331, 'distracting': 1332, 'hour': 1333, 'entire': 1334, 'accept': 1335, 'cbr': 1336, 'mp3s': 1337, 'preferably': 1338, 'ripped': 1339, 'windows': 1340, 'media': 1341, 'beat': 1342, 'shots': 1343, 'sos': 1344, 'signals': 1345, 'allows': 1346, 'connect': 1347, 'miniusb': 1348, 'near': 1349, 'open': 1350, 'allowing': 1351, 'startac': 1352, 'regretted': 1353, 'outperform': 1354, 'china': 1355, 'v325i': 1356, 'numbers': 1357, 'sim': 1358, 
'3o': 1359, 'phonemy': 1360, 'r': 1361, 'crashed': 1362, 'replaced': 1363, 'quit': 1364, '18': 1365, 'iphone': 1366, '4s': 1367, 'despite': 1368, 'connecting': 1369, 'multiple': 1370, 'power': 1371, 'sources': 1372, 'imac': 1373, 'external': 1374, 'wall': 1375, 'outlet': 1376, 'etc': 1377, 'bells': 1378, 'whistles': 1379, 'mediocre': 1380, 'slide': 1381, 'grip': 1382, 'prevents': 1383, 'slipping': 1384, 'hand': 1385, 'onethis': 1386, 'span': 1387, 'exclaim': 1388, 'whoa': 1389, 'tv': 1390, 'corded': 1391, 'freedom': 1392, 'passed': 1393, 'mark': 1394, 'shows': 1395, 'signs': 1396, '100': 1397, 'functional': 1398, 'soft': 1399, 'tight': 1400, 'cut': 1401, 'shape': 1402, 'copier': 1403, 'sizes': 1404, 'sent': 1405, 'sold': 1406, 'units': 1407, 'pros': 1408, 'provides': 1409, 'classy': 1410, 'krussel': 1411, 'tracfonewebsite': 1412, 'toactivate': 1413, 'good4': 1414, 'texas': 1415, 'dit': 1416, '5320': 1417, 'mainly': 1418, 'soon': 1419, 'blueant': 1420, 'supertooth': 1421, 'metro': 1422, 'pcs': 1423, 'schr450': 1424, 'slider': 1425, 'premium': 1426, 'plugs': 1427, 'plenty': 1428, 'capacity': 1429, 'confortable': 1430, 'somewhat': 1431, 'periods': 1432, 'ant': 1433, 'hey': 1434, 'pleasantly': 1435, 'suprised': 1436, 'cost': 1437, 'dustpan': 1438, 'indoors': 1439, 'disposable': 1440, 'puff': 1441, 'smoke': 1442, 'convenient': 1443, 'ride': 1444, 'smoother': 1445, 'nano': 1446, 'itmy': 1447, 'son': 1448, 'dissapointed': 1449, 'reccommend': 1450, 'carries': 1451, 'highest': 1452, 'antiglare': 1453, 'protector': 1454, 'date': 1455, 'smartphone': 1456, 'atleast': 1457, 'addition': 1458, 'amp': 1459, 'reoccurebottom': 1460, 'cingular': 1461, 'methe': 1462, 'creaks': 1463, 'wooden': 1464, 'floor': 1465, 'apartment': 1466, 'generally': 1467, 'inconspicuous': 1468, 'boot': 1469, 'slowly': 1470, 'sorry': 1471, 'impossible': 1472, 'refused': 1473, 'upgrade': 1474, 'discount': 1475, 'securly': 1476, 'possibility': 1477, 'double': 1478, 'booking': 1479, 'break': 1480, 'entertainment': 1481, 'communication': 1482, 'managementoh': 1483, 'activesync': 1484, '42': 1485, 'optimal': 1486, 'synchronization': 1487, 'disgusting': 1488, 'coupon': 1489, 'rare': 1490, 'instance': 1491, 'perfect': 1492, 'ps3': 1493, 'cheapy': 1494, 'lots': 1495, 'sounded': 1496, 'talking': 1497, 'shouting': 1498, 'telephone': 1499, 'wind': 1500, 'yes': 1501, 'shiny': 1502, 'grtting': 1503, '744': 1504, 'v3c': 1505, 'thumbs': 1506, 'exceeds': 1507, 'feet': 1508, 'sight': 1509, 'improper': 1510, 'chargelife': 1511, 'checked': 1512, 'ordering': 1513, 'effects': 1514, 'palms': 1515, 'awkward': 1516, 'hoped': 1517, 'father': 1518, 'v265': 1519, 'pads': 1520, 'easily': 1521, 'stops': 1522, 'intermittently': 1523, 'reaching': 1524, 'row': 1525, 'send': 1526, 'keys': 1527, 'be3': 1528, 'nightmare': 1529, 'describe': 1530, 'speakerphone': 1531, 'cassette': 1532, 'current': 1533, 'cellphones': 1534, 'planning': 1535, 'says': 1536, 'dirty': 1537, 'autoanswer': 1538, 'read': 1539, 'havent': 1540, 'products': 1541, 'sensor': 1542, 'reliability': 1543, 'beeping': 1544, 'letting': 1545, 'dieing': 1546, 'laptop': 1547, 'ir': 1548, 'yearsgreat': 1549, 'cancellation': 1550, 'counterfeit': 1551, 'travled': 1552, 'swivel': 1553, 'sister': 1554, 'dualpurpose': 1555, '8125': 1556, 'keeping': 1557, 'inside': 1558, 'bottowm': 1559, 'lineanother': 1560, 'gimmick': 1561, 'opens': 1562, 'broken': 1563, 'causing': 1564, 'discomfort': 1565, 'trust': 1566, 'loudglad': 1567, 'maintains': 1568, 'flawless': 1569, 'normal': 1570, 'making': 1571, 'fails': 1572, 
'wrongfirst': 1573, 'devices': 1574, 'utterly': 1575, 'confusing': 1576, 'lose': 1577, 'holder': 1578, 'cutouts': 1579, 'landline': 1580, 'loops': 1581, 'material': 1582, 'flaws': 1583, 'exceptional': 1584, 'owning': 1585, 'official': 1586, 'oem': 1587, 'loudest': 1588, 'setting': 1589, 'competitors': 1590, 'saved': 1591, 'alot': 1592, 'cuts': 1593, 'beep': 1594, 'ok': 1595, 'totally': 1596, 'unintelligible': 1597, 'word': 1598, 'restart': 1599, 'managed': 1600, 'bend': 1601, 'leaf': 1602, 'metal': 1603, 'stress': 1604, 'leopard': 1605, 'print': 1606, 'wonderfully': 1607, 'wild': 1608, 'saggy': 1609, 'floppy': 1610, 'looses': 1611, 'abovepretty': 1612, 'soundwise': 1613, 'snap': 1614, '8525': 1615, 'carry': 1616, 'fliptop': 1617, 'loose': 1618, 'wobbly': 1619, 'eventually': 1620, 'receive': 1621, 'seat': 1622, 'fulfills': 1623, 'requirements': 1624, 'fact': 1625, 'rests': 1626, 'lightly': 1627, 'websites': 1628, 'rating': 1629, 'cables': 1630, 'lap': 1631, 'controls': 1632, 'accessable': 1633, 'christmas': 1634, 'rest': 1635, 'joy': 1636, 'satisifed': 1637, '2005': 1638, 's710a': 1639, 'wow': 1640, 'specs': 1641, 'armband': 1642, 'allot': 1643, 'clearer': 1644, 'keypads': 1645, 'reach': 1646, 'ericson': 1647, 'z500a': 1648, 'motor': 1649, 'control': 1650, 'center': 1651, 'voltage': 1652, 'humming': 1653, 'equipment': 1654, 'certain': 1655, 'places': 1656, 'girl': 1657, 'complain': 1658, 'wake': 1659, 'styling': 1660, 'restocking': 1661, 'fee': 1662, 'darn': 1663, 'lousy': 1664, 'seen': 1665, 'sweetest': 1666, 'securely': 1667, 'hook': 1668, 'directed': 1669, 'canal': 1670, 'problemvery': 1671, 'unsatisfactory': 1672, 'videos': 1673, 'negatively': 1674, 'adapter': 1675, 'provide': 1676, 'hype': 1677, 'assumed': 1678, 'lense': 1679, 'covered': 1680, 'falls': 1681, 'text': 1682, 'messaging': 1683, 'tricky': 1684, 'painful': 1685, 'lasted': 1686, 'blew': 1687, 'flops': 1688, 'smudged': 1689, 'touches': 1690, 'disappoint': 1691, 'infra': 1692, 'port': 1693, 'irda': 1694, 'answer': 1695}
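%% Cell type:markdown id: tags:
For debugging the encoded reviews it can help to invert this mapping. `int_to_vocab` below is a hypothetical helper, not part of the original notebook:
%% Cell type:code id: tags:
``` python
# Hypothetical helper (not in the original notebook): invert the
# word-to-integer mapping so encoded reviews can be decoded back to words.
int_to_vocab = {number: word for word, number in vocab_to_int.items()}
print(int_to_vocab[1], int_to_vocab[9])  # expected: way jawbone
```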
%% Cell type:code id: tags:
``` python
# verify the length of the created dictionary
print(len(vocab_to_int))
```
%% Output
1695
%% Cell type:code id: tags:
``` python
# Count word occurrences separately for positive and negative reviews
positive_counts = Counter()
negative_counts = Counter()
for i in range(len(clean_reviews)):
    if str(labels[i]) == '1\n':
        for word in clean_reviews[i].split(" "):
            positive_counts[word] += 1
    else:
        for word in clean_reviews[i].split(" "):
            negative_counts[word] += 1
```
%% Cell type:code id: tags:
``` python
positive_counts.most_common()[:10]
```
%% Output
[('great', 92),
('phone', 86),
('good', 62),
('works', 46),
('product', 33),
('quality', 31),
('headset', 31),
('sound', 27),
('excellent', 26),
('price', 25)]
%% Cell type:code id: tags:
``` python
negative_counts.most_common()[:10]
```
%% Output
[('phone', 76),
('dont', 26),
('work', 25),
('battery', 23),
('product', 22),
('use', 20),
('ear', 19),
('money', 18),
('quality', 18),
('time', 16)]
%% Cell type:code id: tags:
``` python
print("Labels : {}".format(set(labels)))
```
%% Output
Labels : {'1\n', '0\n'}
%% Cell type:code id: tags:
``` python
vocab_to_int['jawbone']
```
%% Output
9
%% Cell type:code id: tags:
``` python
# 1 for a positive label and 0 for a negative label
def one_hot(labels):
    one_hot_labels = []
    for i in range(len(labels)):
        if labels[i] == '1\n':
            one_hot_labels.append(1)
        else:
            one_hot_labels.append(0)
    return one_hot_labels
encoded_labels = one_hot(labels)
```
%% Cell type:code id: tags:
``` python
print("Length of encoded labels :{} ".format(len(encoded_labels)))
print("Length of reviews list :{} ".format(len(reviews_list)))
```
%% Output
Length of encoded labels :1000
Length of reviews list :1000
%% Cell type:code id: tags:
``` python
# prepare data that can be used for training language models
# reviews_ints = []
# for review in clean_reviews:
#     reviews_ints.append([vocab_to_int[word] for word in review.split()])

# prepare data with nouns removed from reviews that can be used for training language models
# reviews_ints = []
# for review in clean_reviews_no_nouns:
#     reviews_ints.append([vocab_to_int[word] for word in review.split()])

# prepare data with adjectives removed from reviews that can be used for training language models (active variant)
reviews_ints = []
for review in clean_reviews_no_adjectives:
    reviews_ints.append([vocab_to_int[word] for word in review.split()])

# prepare data with verbs removed from reviews that can be used for training language models
# reviews_ints = []
# for review in clean_reviews_no_verbs:
#     reviews_ints.append([vocab_to_int[word] for word in review.split()])
```
%% Cell type:code id: tags:
``` python
# Check whether any review became empty after pre-processing; such reviews are removed below, since their padded input would be all zeros.
review_lens = Counter([len(x) for x in reviews_ints])
empty_reviews_present = (review_lens[0]>0)
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))
```
%% Output
Zero-length reviews: 20
Maximum review length: 14
%% Cell type:code id: tags:
``` python
if empty_reviews_present:
    print('Number of reviews before removing outliers: ', len(reviews_ints))
    ## remove any reviews/labels with zero length from the reviews_ints list
    # get indices of reviews with non-zero length
    non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
    # remove 0-length reviews and their labels
    reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
    encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])
    print('Number of reviews after removing outliers: ', len(reviews_ints))
```
%% Output
Number of reviews before removing outliers: 1000
Number of reviews after removing outliers: 980
%% Cell type:code id: tags:
``` python
# Pad (or truncate) each encoded review to a fixed length
def pad_features(reviews_ints, seq_length):
    return pad_sequences(reviews_ints, maxlen=seq_length)
```
%% Cell type:code id: tags:
``` python
seq_length = 200
features = pad_features(reviews_ints, seq_length=seq_length)
# print first 10 values
print(features[:10 ,:10])
```
%% Output
[[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]]
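%% Cell type:markdown id: tags:
The all-zero block above is expected: `seq_length` is 200 while the longest review has only 14 tokens, so the first columns of every row are padding. A toy call (illustration only) makes the `pad_sequences` behavior behind `pad_features` concrete: shorter sequences are pre-padded with zeros and longer ones are pre-truncated.
%% Cell type:code id: tags:
``` python
# Illustration only: Keras pad_sequences pre-pads with 0 and
# pre-truncates to maxlen by default.
toy = [[1, 2, 3], [4, 5, 6, 7, 8, 9]]
print(pad_features(toy, seq_length=5))
# expected:
# [[0 0 1 2 3]
#  [5 6 7 8 9]]
```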
%% Cell type:code id: tags:
``` python
# Split the entire dataset into train, test and validation sets
train_frac = 0.8
test_and_val_frac = 0.2
# test and val are each half of the remaining 0.2
val_frac = 0.5
test_frac = 0.5
def train_test_val_split(features):
    X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels, test_size=test_and_val_frac, train_size=train_frac, random_state=5, shuffle=True)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=test_frac, train_size=val_frac, random_state=5, shuffle=True)
    return X_train, X_val, X_test
def train_test_val_labels(encoded_labels):
    X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels, test_size=test_and_val_frac, train_size=train_frac, random_state=5, shuffle=True)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=test_frac, train_size=val_frac, random_state=5, shuffle=True)
    return y_train, y_val, y_test
train_x, val_x, test_x = train_test_val_split(features)
train_y, val_y, test_y = train_test_val_labels(encoded_labels)
```
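%% Cell type:markdown id: tags:
Note on the split above: `train_test_val_split` and `train_test_val_labels` repeat the same two `train_test_split` calls, and the feature and label splits only stay aligned because both use `random_state=5`. A single function returning both (a sketch, equivalent under the same fractions and seeds, not part of the original notebook) avoids the duplication:
%% Cell type:code id: tags:
``` python
# Sketch (not in the original notebook): one pass that returns aligned
# feature and label splits using the same fractions and seeds as above.
def train_test_val_split_xy(features, encoded_labels):
    X_train, X_rest, y_train, y_rest = train_test_split(
        features, encoded_labels, test_size=test_and_val_frac,
        train_size=train_frac, random_state=5, shuffle=True)
    X_test, X_val, y_test, y_val = train_test_split(
        X_rest, y_rest, test_size=test_frac,
        train_size=val_frac, random_state=5, shuffle=True)
    return X_train, X_val, X_test, y_train, y_val, y_test
```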
%% Cell type:code id: tags:
``` python
## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
"\nValidation set: \t{}".format(val_x.shape),
"\nTest set: \t\t{}".format(test_x.shape))
## print out the shapes of your resultant label data
print("\t\t\t Label:")
print("Train set labels: \t\t{}".format(train_y[:10]))
print("\nValidation set labels: \t\t{}".format(val_y[:10]))
print("\nTest set labels: \t\t{}".format(test_y[:10]))
```
%% Output
Feature Shapes:
Train set: (784, 200)
Validation set: (98, 200)
Test set: (98, 200)
Label:
Train set labels: [0 1 1 1 1 0 1 1 1 0]
Validation set labels: [0 1 0 0 1 0 0 0 1 0]
Test set labels: [1 1 1 0 0 1 0 1 0 1]
%% Cell type:code id: tags:
``` python
# create Tensor datasets for train, test and val
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))
# dataloaders
batch_size = 32
# SHUFFLE training data
train_loader = DataLoader(train_data, batch_size=batch_size, drop_last= True, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, batch_size=1, drop_last = True)
```
%% Cell type:code id: tags:
``` python
# obtain one batch of training data and label.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)
```
%% Output
Sample input size: torch.Size([32, 200])
Sample input:
tensor([[ 0, 0, 0, ..., 998, 435, 745],
[ 0, 0, 0, ..., 584, 97, 1211],
[ 0, 0, 0, ..., 0, 1158, 478],
...,
[ 0, 0, 0, ..., 61, 605, 606],
[ 0, 0, 0, ..., 224, 110, 225],
[ 0, 0, 0, ..., 0, 653, 465]], dtype=torch.int32)
Sample label size: torch.Size([32])
Sample label:
tensor([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
1, 1, 1, 0, 0, 0, 0, 1])
%% Cell type:code id: tags:
``` python
# Check if GPU is available.
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')
```
%% Output
No GPU available, training on CPU.
%% Cell type:code id: tags:
``` python
class SentimentLSTM(nn.Module):
    """
    The LSTM model that will be used to perform sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.3):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentLSTM, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # embedding, LSTM and dropout layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.6)
        # linear layers and the final sigmoid
        self.fc1 = nn.Linear(hidden_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 16)
        self.fc4 = nn.Linear(16, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)
        # embedding and LSTM output
        x = x.long()
        embedd = self.embedding(x)
        lstm_out, hidden = self.lstm(embedd, hidden)
        # stack up LSTM outputs
        lstm_out = lstm_out.reshape(-1, self.hidden_dim)
        # dropout and fully-connected layers
        out = self.dropout(lstm_out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.dropout(out)
        out = self.fc3(out)
        out = self.dropout(out)
        out = self.fc4(out)
        # sigmoid function
        sig_out = self.sig(out)
        # reshape so that batch_size is the first dimension
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]  # keep only the output at the last time step
        return sig_out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for the hidden state and cell state of the LSTM
        weight = next(self.parameters()).data
        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden
```
%% Cell type:code id: tags:
``` python
# SentimentLSTM: Instantiate the model with these hyperparameters
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding + our word tokens
output_size = 1
embedding_dim = 1000
hidden_dim = 256
n_layers = 2
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)
```
%% Output
SentimentLSTM(
(embedding): Embedding(1696, 1000)
(lstm): LSTM(1000, 256, num_layers=2, batch_first=True, dropout=0.3)
(dropout): Dropout(p=0.6, inplace=False)
(fc1): Linear(in_features=256, out_features=128, bias=True)
(fc2): Linear(in_features=128, out_features=64, bias=True)
(fc3): Linear(in_features=64, out_features=16, bias=True)
(fc4): Linear(in_features=16, out_features=1, bias=True)
(sig): Sigmoid()
)
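%% Cell type:markdown id: tags:
A quick sanity check (illustration only, not in the original notebook): push one dummy batch through the untrained network to confirm that `forward` returns one sigmoid output per review, i.e. a tensor of shape `(batch_size,)`. This assumes the CPU setup shown above.
%% Cell type:code id: tags:
``` python
# Illustration only: forward one all-padding dummy batch through the model.
dummy_batch_size = 4
dummy_x = torch.zeros(dummy_batch_size, seq_length, dtype=torch.long)
dummy_h = net.init_hidden(dummy_batch_size)
dummy_out, _ = net(dummy_x, dummy_h)
print(dummy_out.shape)  # expected: torch.Size([4])
```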
%% Cell type:code id: tags:
``` python
# loss and optimization functions
lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
```
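%% Cell type:markdown id: tags:
`nn.BCELoss` expects probabilities in (0, 1), which is why the network ends in a sigmoid rather than emitting raw logits. A tiny worked example (illustration only): for a predicted probability p and target y, the loss is -[y*log(p) + (1-y)*log(1-p)].
%% Cell type:code id: tags:
``` python
# Illustration only: binary cross-entropy on a single prediction.
p = torch.tensor([0.9])   # predicted probability of the positive class
y = torch.tensor([1.0])   # true label
print(criterion(p, y))                 # tensor(0.1054)
print(-torch.log(torch.tensor(0.9)))   # same value: -log(0.9)
```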
%% Cell type:code id: tags:
``` python
#Training and Validation
epochs = 2
training_loss = []
validation_loss = []
counter = 0
print_every = 1
clip=1 # gradient clipping
# move model to GPU, if available
if(train_on_gpu):
net.cuda()
net.train()
# train for some number of epochs
for e in range(epochs):
# initialize hidden state
h = net.init_hidden(batch_size)
# batch loop
for inputs, labels in train_loader:
counter += 1000
if(train_on_gpu):
inputs, labels = inputs.cuda(), labels.cuda()
# Creating new variables for the hidden state, otherwise
# we'd backprop through the entire training history
h = tuple([each.data for each in h])
# zero accumulated gradients
net.zero_grad()
# get the output from the model (train_loader never yields None, so no guard is needed)
output, h = net(inputs, h)
# calculate the loss and perform backprop
loss = criterion(output.squeeze(), labels.float())
loss.backward()
# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
nn.utils.clip_grad_norm_(net.parameters(), clip)
optimizer.step()
# loss stats
if counter % print_every == 0:
# Get validation loss
val_h = net.init_hidden(batch_size)
val_losses = []
net.eval()
for inputs, labels in valid_loader:
# detach the validation hidden state between batches
val_h = tuple([each.data for each in val_h])
if(train_on_gpu):
inputs, labels = inputs.cuda(), labels.cuda()
output, val_h = net(inputs, val_h)
val_loss = criterion(output.squeeze(), labels.float())
val_losses.append(val_loss.item())
net.train()
print("Epoch: {}/{}...".format(e+1, epochs),
"Step: {}...".format(counter),
"Loss: {:.6f}...".format(loss.item()),
"Val Loss: {:.6f}".format(np.mean(val_losses)))
# record the last training/validation loss of the epoch for the plot below
print("adding loss at end of each epoch")
training_loss.append((e+1, loss.item()))
validation_loss.append((e+1, val_loss.item()))
```
%% Output
Epoch: 1/2... Step: 1000... Loss: 0.690470... Val Loss: 0.691225
Epoch: 1/2... Step: 2000... Loss: 0.718603... Val Loss: 0.694506
Epoch: 1/2... Step: 3000... Loss: 0.716760... Val Loss: 0.690104
Epoch: 1/2... Step: 4000... Loss: 0.718793... Val Loss: 0.692191
Epoch: 1/2... Step: 5000... Loss: 0.661848... Val Loss: 0.691820
Epoch: 1/2... Step: 6000... Loss: 0.685669... Val Loss: 0.689586
Epoch: 1/2... Step: 7000... Loss: 0.799937... Val Loss: 0.689925
Epoch: 1/2... Step: 8000... Loss: 0.673774... Val Loss: 0.692445
Epoch: 1/2... Step: 9000... Loss: 0.742956... Val Loss: 0.698549
Epoch: 1/2... Step: 10000... Loss: 0.793578... Val Loss: 0.707168
Epoch: 1/2... Step: 11000... Loss: 0.770233... Val Loss: 0.704841
Epoch: 1/2... Step: 12000... Loss: 0.739290... Val Loss: 0.699305
Epoch: 1/2... Step: 13000... Loss: 0.704423... Val Loss: 0.693387
Epoch: 1/2... Step: 14000... Loss: 0.811122... Val Loss: 0.689733
Epoch: 1/2... Step: 15000... Loss: 0.732553... Val Loss: 0.687914
Epoch: 1/2... Step: 16000... Loss: 0.782480... Val Loss: 0.686848
Epoch: 1/2... Step: 17000... Loss: 0.626204... Val Loss: 0.687253
Epoch: 1/2... Step: 18000... Loss: 0.719936... Val Loss: 0.689473
Epoch: 1/2... Step: 19000... Loss: 0.695033... Val Loss: 0.690230
Epoch: 1/2... Step: 20000... Loss: 0.728073... Val Loss: 0.688848
Epoch: 1/2... Step: 21000... Loss: 0.748381... Val Loss: 0.685583
Epoch: 1/2... Step: 22000... Loss: 0.726212... Val Loss: 0.683036
Epoch: 1/2... Step: 23000... Loss: 0.799253... Val Loss: 0.680151
Epoch: 1/2... Step: 24000... Loss: 0.722329... Val Loss: 0.684737
adding loss at end of each epoch
Epoch: 2/2... Step: 25000... Loss: 0.725197... Val Loss: 0.688515
Epoch: 2/2... Step: 26000... Loss: 0.752304... Val Loss: 0.687670
Epoch: 2/2... Step: 27000... Loss: 0.736704... Val Loss: 0.682892
Epoch: 2/2... Step: 28000... Loss: 0.708128... Val Loss: 0.676917
Epoch: 2/2... Step: 29000... Loss: 0.680346... Val Loss: 0.673774
Epoch: 2/2... Step: 30000... Loss: 0.627726... Val Loss: 0.671572
Epoch: 2/2... Step: 31000... Loss: 0.651418... Val Loss: 0.671215
Epoch: 2/2... Step: 32000... Loss: 0.666228... Val Loss: 0.665977
Epoch: 2/2... Step: 33000... Loss: 0.649267... Val Loss: 0.655129
Epoch: 2/2... Step: 34000... Loss: 0.705611... Val Loss: 0.643028
Epoch: 2/2... Step: 35000... Loss: 0.619797... Val Loss: 0.630864
Epoch: 2/2... Step: 36000... Loss: 0.655143... Val Loss: 0.615987
Epoch: 2/2... Step: 37000... Loss: 0.687029... Val Loss: 0.601638
Epoch: 2/2... Step: 38000... Loss: 0.689281... Val Loss: 0.593264
Epoch: 2/2... Step: 39000... Loss: 0.590002... Val Loss: 0.578642
Epoch: 2/2... Step: 40000... Loss: 0.574886... Val Loss: 0.561962
Epoch: 2/2... Step: 41000... Loss: 0.570394... Val Loss: 0.552343
Epoch: 2/2... Step: 42000... Loss: 0.539024... Val Loss: 0.542348
Epoch: 2/2... Step: 43000... Loss: 0.560558... Val Loss: 0.531923
Epoch: 2/2... Step: 44000... Loss: 0.649121... Val Loss: 0.522496
Epoch: 2/2... Step: 45000... Loss: 0.539961... Val Loss: 0.514507
Epoch: 2/2... Step: 46000... Loss: 0.502711... Val Loss: 0.523169
Epoch: 2/2... Step: 47000... Loss: 0.453961... Val Loss: 0.540095
Epoch: 2/2... Step: 48000... Loss: 0.586832... Val Loss: 0.560444
adding loss at end of each epoch
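One detail in the loop above deserves a note: re-wrapping the hidden state with `tuple([each.data for each in h])` detaches it from the previous batch's computation graph, which is what keeps backpropagation truncated to a single batch. The modern, equivalent idiom, shown here only as a sketch and not a change to the notebook, uses `.detach()`:

``` python
# Sketch: detach the LSTM hidden state between batches so gradients
# do not flow back through the entire training history.
def detach_hidden(hidden):
    return tuple(h.detach() for h in hidden)

# inside the batch loop one would then write:
# h = detach_hidden(h)
```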
%% Cell type:code id: tags:
``` python
# Plot the training and validation loss recorded at the end of each epoch
training_loss_arr = np.array(training_loss)
validation_loss_arr = np.array(validation_loss)
plt.plot(training_loss_arr[:, 0], training_loss_arr[:, 1], color='#FFC107', label='Training Loss')
plt.plot(validation_loss_arr[:, 0], validation_loss_arr[:, 1], color='#008080', label='Validation Loss')
plt.xlabel('Epochs')
plt.title('Sentiment Analysis: Training and Validation Loss')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
def predict():
test_losses = [] # track loss
num_correct = 0
# init hidden state (the test_loader here uses a batch size of 1)
h = net.init_hidden(1)
net.eval()
# batch loop
for inputs, labels in test_loader:
if(train_on_gpu):
inputs, labels = inputs.cuda(), labels.cuda()
# detach the hidden state carried over from the previous review
h = tuple([each.data for each in h])
# get the output from the model (evaluation only, no gradients needed)
output, h = net(inputs, h)
# calculate the loss for reporting; no backprop is performed here
loss = criterion(output.squeeze(), labels.float())
test_losses.append(loss.item())
# convert output probabilities to predicted class (0 or 1)
pred = torch.round(output.squeeze())
# compare predictions to true label
correct_tensor = pred.eq(labels.float().view_as(pred))
correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
num_correct += np.sum(correct)
# -- stats -- #
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))
# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))
predict()
```
%% Output
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py:498: UserWarning: Using a target size (torch.Size([1])) that is different to the input size (torch.Size([])) is deprecated. Please ensure they have the same size.
return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
Test loss: 0.677
Test accuracy: 0.724
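The UserWarning above appears because, with a batch size of 1, `output.squeeze()` collapses to a 0-dimensional tensor while `labels.float()` keeps shape `(1,)`. A minimal fix, shown as a sketch (the notebook leaves the warning in place), is to flatten both sides explicitly:

``` python
# Sketch: give BCELoss input and target the same 1-D shape to silence the warning.
loss = criterion(output.view(-1), labels.float().view(-1))
```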
%% Cell type:code id: tags:
``` python
def predict_review(review, seq_length = 100):
device = "cuda" if torch.cuda.is_available() else "cpu"
#pre-process the review
clean_review_words = []
review_preprocess = review.translate(str.maketrans('', '', string.punctuation))
split_words = review_preprocess.split(' ')
clean_review_words = [word.lower() for word in split_words if word not in string.punctuation and word.lower() not in STOP_WORDS]
# skip words that never appeared in the training vocabulary to avoid a KeyError
encoded_words = [vocab_to_int[word] for word in clean_review_words if word in vocab_to_int]
# the empty check must happen before padding, since padding always yields one row
if len(encoded_words) == 0:
print("Your review must contain at least 1 word!")
return None
padded_words = pad_features([encoded_words], seq_length)
padded_words = torch.from_numpy(padded_words).to(device)
#make prediction on review
net.eval()
h = net.init_hidden(1)
output, h = net(padded_words, h)
pred = torch.round(output.squeeze())
print(pred)
sentiment = "This is a positive review." if pred == 1 else "This is a negative review."
return sentiment
review1 = "Battery is really great."
review2 = "This headset is not good."
review3 = "I didn't like this product."
review4 = "bad converter"
review5 = "In love with this device"
### OUTPUT ###
result = predict_review(review1)
print("Review 1: {}, Sentiment: {}".format(review1, result))
result = predict_review(review2)
print("Review 2: {}, Sentiment: {}".format(review2, result))
result = predict_review(review3)
print("Review 3: {}, Sentiment: {}".format(review3, result))
result = predict_review(review4)
print("Review 4: {}, Sentiment: {}".format(review4, result))
result = predict_review(review5)
print("Review 5: {}, Sentiment: {}".format(review5, result))
```
%% Output
tensor(1., grad_fn=<RoundBackward>)
Review 1: Battery is really great., Sentiment: This is a positive review.
tensor(1., grad_fn=<RoundBackward>)
Review 2: This headset is not good., Sentiment: This is a positive review.
tensor(0., grad_fn=<RoundBackward>)
Review 3: I didn't like this product., Sentiment: This is a negative review.
tensor(0., grad_fn=<RoundBackward>)
Review 4: bad converter, Sentiment: This is a negative review.
tensor(1., grad_fn=<RoundBackward>)
Review 5: In love with this device, Sentiment: This is a positive review.
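Review 2 ("This headset is not good.") is misclassified as positive, and the pre-processing is a likely culprit: spaCy's STOP_WORDS contains negation words such as "not", so the negation never reaches the model. A hedged sketch of a stop-word list that preserves negations:

``` python
# Sketch: keep negation words out of the stop-word list so sentiment-flipping
# tokens like "not" survive pre-processing. (Assumes spaCy's STOP_WORDS, as above.)
from spacy.lang.en.stop_words import STOP_WORDS

negations = {"not", "no", "never", "nor"}
custom_stop_words = STOP_WORDS - negations
print("not" in custom_stop_words) # False - "not" would now reach the model
```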
......
This diff is collapsed.
%% Cell type:code id: tags:
``` python
from google.colab import drive
drive.mount('/content/drive')
```
%% Output
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
%% Cell type:code id: tags:
``` python
#import all necessary libraries here
import os
import numpy as np
import pandas as pd
import string
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
```
%% Output
Using TensorFlow backend.
%% Cell type:code id: tags:
``` python
os.chdir('/content/drive/My Drive')
input_file_path = '/content/drive/My Drive/amazon_cells_labelled.txt'
```
%% Cell type:code id: tags:
``` python
# split reviews and labels from amazon_cells_labelled.txt
amazonData = pd.read_csv(input_file_path, delimiter='\t', header=None, names = ['Review' , 'Sentiment'])
amazonData.head()
```
%% Output
Review Sentiment
0 So there is no way for me to plug it in here i... 0
1 Good case, Excellent value. 1
2 Great for the jawbone. 1
3 Tied to charger for conversations lasting more... 0
4 The mic is great. 1
%% Cell type:code id: tags:
``` python
#form reviews and labels list
reviews_list = []
labels = []
with open(input_file_path, 'r') as f:
reviews_ = f.readlines()
```
%% Cell type:code id: tags:
``` python
#required for identifying the nouns, verbs and adjectives in the reviews
import nltk
nltk.download('averaged_perceptron_tagger')
```
%% Output
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /root/nltk_data...
[nltk_data] Package averaged_perceptron_tagger is already up-to-
[nltk_data] date!
True
%% Cell type:code id: tags:
``` python
# Prepare data for data pre-processing steps
reviews_list_without_nouns = []
reviews_list_without_adjectives = []
reviews_list_without_verbs = []
table = str.maketrans('', '', string.punctuation)
for i in range(len(reviews_)):
review, label = reviews_[i].split('\t')
review = review.replace('.', '')
stripped = [w.translate(table) for w in review.split(' ')]
review = ' '.join(stripped)
# tag the review once and reuse the result for all three variants
tagged_sentence = nltk.tag.pos_tag(review.split())
# remove proper nouns from the review
edited_sentence = [word for word, tag in tagged_sentence if tag not in ('NNP', 'NNPS')]
review_noun_removed = ' '.join(edited_sentence)
reviews_list_without_nouns.append(review_noun_removed.lower())
# remove adjectives from the review
edited_sentence = [word for word, tag in tagged_sentence if tag != 'JJ']
review_adjective_removed = ' '.join(edited_sentence)
reviews_list_without_adjectives.append(review_adjective_removed.lower())
# remove verbs from the review
edited_sentence = [word for word, tag in tagged_sentence if tag not in ('VB', 'VBD', 'VBG', 'VBN', 'VBP')]
review_verb_removed = ' '.join(edited_sentence)
reviews_list_without_verbs.append(review_verb_removed.lower())
#original reviews
reviews_list.append(review.lower())
labels.append(label)
#Visualize data before applying data pre-processing techniques
print("Original reviews after removing punctuations: \n")
for i in range(5):
print(str(labels[i]) + "\t: " + reviews_list[i][:])
print(" \n Reviews after removing punctuations and nouns: \n")
for i in range(5):
print(str(labels[i]) + "\t: " + reviews_list_without_nouns[i][:])
print(" \n Reviews after removing punctuations and adjectives: \n")
for i in range(5):
print(str(labels[i]) + "\t: " + reviews_list_without_adjectives[i][:])
print(" \n Reviews after removing punctuations and verbs: \n")
for i in range(5):
print(str(labels[i]) + "\t: " + reviews_list_without_verbs[i][:])
```
%% Output
Original reviews after removing punctuations:
0
: so there is no way for me to plug it in here in the us unless i go by a converter
1
: good case excellent value
1
: great for the jawbone
0
: tied to charger for conversations lasting more than 45 minutesmajor problems
1
: the mic is great
Reviews after removing punctuations and nouns:
0
: so there is no way for me to plug it in here in the unless i go by a converter
1
: good case value
1
: for the jawbone
0
: tied to charger for conversations lasting more than 45 minutesmajor
1
: the mic is great
Reviews after removing punctuations and adjectives:
0
: so there is no way for me to plug it in here in the us unless i go by a converter
1
: case excellent value
1
: great for the jawbone
0
: tied to charger for conversations lasting more than 45 minutesmajor problems
1
: the mic is
Reviews after removing punctuations and verbs:
0
: so there is no way for me to it in here in the us unless i by a converter
1
: good case excellent value
1
: great for the jawbone
0
: to for conversations more than 45 minutesmajor problems
1
: the mic is great
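For reference, the Penn Treebank tags produced by `pos_tag` are what the noun/adjective/verb filters above key on. A minimal sketch (the exact tags shown are an assumption and may vary with the tagger version):

``` python
# Sketch: the tags that the filters above match against.
import nltk
nltk.download('averaged_perceptron_tagger', quiet=True)

print(nltk.tag.pos_tag("the mic is great".split()))
# e.g. [('the', 'DT'), ('mic', 'NN'), ('is', 'VBZ'), ('great', 'JJ')]
```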
%% Cell type:code id: tags:
``` python
reviews = '\n'.join(reviews_list)
reviews_no_nouns = '\n'.join(reviews_list_without_nouns)
reviews_no_adjectives = '\n'.join(reviews_list_without_adjectives)
reviews_no_verbs = '\n'.join(reviews_list_without_verbs)
reviews_process = ' '.join(reviews_list)
print(labels)
```
%% Output
['0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', 
'0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '0\n', '1\n', '1\n', '1\n', '0\n', '1\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '0\n', '0\n', '0\n', '1\n', '1\n', '1\n', '1\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n', '0\n']
%% Cell type:code id: tags:
``` python
# remove punctuations
def text_without_punct(reviews):
return reviews.translate(str.maketrans('', '', string.punctuation))
#original reviews without punctuation as a string
no_punct_text = text_without_punct(reviews)
reviews_split = reviews.split('\n')
# print("original reviews after split: \n", reviews_split[:100])
#reviews without punctuation and nouns as a string
no_punct_text_and_nouns = text_without_punct(reviews_no_nouns)
reviews_split_no_nouns = reviews_no_nouns.split('\n')
#reviews without punctuation and adjectives as a string
no_punct_text_and_adjectives = text_without_punct(reviews_no_adjectives)
reviews_split_no_adjectives = reviews_no_adjectives.split('\n')
#reviews without punctuation and verbs as a string
no_punct_text_and_verbs = text_without_punct(reviews_no_verbs)
reviews_split_no_verbs = reviews_no_verbs.split('\n')
```
%% Cell type:code id: tags:
``` python
print("original reviews after split: \n", reviews_split[0])
print("reviews with noun removed and split \n ", reviews_split_no_nouns[0])
print("reviews with adjectives removed and split \n ", reviews_split_no_adjectives[0])
print("reviews with verbs removed and split \n ", reviews_split_no_verbs[0])
```
%% Output
original reviews after split:
so there is no way for me to plug it in here in the us unless i go by a converter
reviews with noun removed and split
so there is no way for me to plug it in here in the unless i go by a converter
reviews with adjectives removed and split
so there is no way for me to plug it in here in the us unless i go by a converter
reviews with verbs removed and split
so there is no way for me to it in here in the us unless i by a converter
%% Cell type:code id: tags:
``` python
# Split the formatted no_punct_text into words
def split_in_words(no_punct_text):
return no_punct_text.split()
words = split_in_words(no_punct_text)
print("words after splitting: ", words[:50])
```
%% Output
words after splitting: ['so', 'there', 'is', 'no', 'way', 'for', 'me', 'to', 'plug', 'it', 'in', 'here', 'in', 'the', 'us', 'unless', 'i', 'go', 'by', 'a', 'converter', 'good', 'case', 'excellent', 'value', 'great', 'for', 'the', 'jawbone', 'tied', 'to', 'charger', 'for', 'conversations', 'lasting', 'more', 'than', '45', 'minutesmajor', 'problems', 'the', 'mic', 'is', 'great', 'i', 'have', 'to', 'jiggle', 'the', 'plug']
%% Cell type:code id: tags:
``` python
# print the total number of words
print("Total number of words {}".format(len(words)))
# Total number of unique words
print("Total number of unique words {}".format(len(set(words))))
```
%% Output
Total number of words 10196
Total number of unique words 1905
%% Cell type:code id: tags:
``` python
# Stop word removal
from spacy.lang.en.stop_words import STOP_WORDS
words = [word for word in words if word not in STOP_WORDS]
```
%% Cell type:code id: tags:
``` python
# Clean up original reviews
clean_reviews = []
for review in reviews_list:
review_words = review.split(' ')
review_words = [word for word in review_words if word not in string.punctuation and word not in STOP_WORDS]
clean_reviews.append(' '.join(review_words))
#original reviews
print("original reviews \n")
print (reviews_list[0][:100])
print (clean_reviews[0][:100])
# Clean up reviews without nouns
clean_reviews_no_nouns = []
for review in reviews_list_without_nouns:
review_words = review.split(' ')
review_words = [word for word in review_words if word not in string.punctuation and word not in STOP_WORDS]
clean_reviews_no_nouns.append(' '.join(review_words))
#reviews after nouns removed
print("\nreviews after nouns removed \n")
print (reviews_list_without_nouns[0][:100])
print (clean_reviews_no_nouns[0][:100])
# Clean up reviews without adjectives
clean_reviews_no_adjectives = []
for review in reviews_list_without_adjectives:
review_words = review.split(' ')
review_words = [word for word in review_words if word not in string.punctuation and word not in STOP_WORDS]
clean_reviews_no_adjectives.append(' '.join(review_words))
#reviews after adjectives removed
print("\nreviews after adjectives removed \n")
print (reviews_list_without_adjectives[0][:100])
print (clean_reviews_no_adjectives[0][:100])
# Clean up reviews without verbs
clean_reviews_no_verbs = []
for review in reviews_list_without_verbs:
review_words = review.split(' ')
review_words = [word for word in review_words if word not in string.punctuation and word not in STOP_WORDS]
clean_reviews_no_verbs.append(' '.join(review_words))
#reviews after verbs removed
print("\nreviews after verbs removed \n")
print (reviews_list_without_verbs[0][:100])
print (clean_reviews_no_verbs[0][:100])
```
%% Output
original reviews
so there is no way for me to plug it in here in the us unless i go by a converter
way plug converter
reviews after nouns removed
so there is no way for me to plug it in here in the unless i go by a converter
way plug converter
reviews after adjectives removed
so there is no way for me to plug it in here in the us unless i go by a converter
way plug converter
reviews after verbs removed
so there is no way for me to it in here in the us unless i by a converter
way converter
%% Cell type:code id: tags:
``` python
print("Total number of unique words after stop words removed : {}".format(len(set(words))))
```
%% Output
Total number of unique words after stop words removed : 1695
%% Cell type:code id: tags:
``` python
## Count all the words and maintain a dictionary
def word_count(words):
return Counter(words)
counts=word_count(words)
```
%% Cell type:code id: tags:
``` python
# Check for count of some words
print (counts['converter'])
```
%% Output
1
%% Cell type:code id: tags:
``` python
# define a vocabulary for the words after clean-up
def vocabulary(counts):
return list(counts.keys())
vocab = vocabulary(counts)
vocab[1]
```
%% Output
'plug'
%% Cell type:code id: tags:
``` python
# map each vocab word to an integer, starting at 1 so that 0 stays free for padding
def vocabulary_to_integer(vocab):
return {word:number for number,word in enumerate(vocab,1)}
vocab_to_int = vocabulary_to_integer(vocab)
print(vocab_to_int)
```
%% Output
{'way': 1, 'plug': 2, 'converter': 3, 'good': 4, 'case': 5, 'excellent': 6, 'value': 7, 'great': 8, 'jawbone': 9, 'tied': 10, 'charger': 11, 'conversations': 12, 'lasting': 13, '45': 14, 'minutesmajor': 15, 'problems': 16, 'mic': 17, 'jiggle': 18, 'line': 19, 'right': 20, 'decent': 21, 'volume': 22, 'dozen': 23, 'contacts': 24, 'imagine': 25, 'fun': 26, 'sending': 27, 'razr': 28, 'owneryou': 29, 'needless': 30, 'wasted': 31, 'money': 32, 'waste': 33, 'time': 34, 'sound': 35, 'quality': 36, 'impressed': 37, 'going': 38, 'original': 39, 'battery': 40, 'extended': 41, 'seperated': 42, 'mere': 43, '5': 44, 'ft': 45, 'started': 46, 'notice': 47, 'excessive': 48, 'static': 49, 'garbled': 50, 'headset': 51, 'design': 52, 'odd': 53, 'ear': 54, 'clip': 55, 'comfortable': 56, 'highly': 57, 'recommend': 58, 'blue': 59, 'tooth': 60, 'phone': 61, 'advise': 62, 'fooled': 63, 'far': 64, 'works': 65, 'clicks': 66, 'place': 67, 'makes': 68, 'wonder': 69, 'long': 70, 'mechanism': 71, 'went': 72, 'motorolas': 73, 'website': 74, 'followed': 75, 'directions': 76, 'pair': 77, 'bought': 78, 'use': 79, 'kindle': 80, 'fire': 81, 'absolutely': 82, 'loved': 83, 'commercials': 84, 'misleading': 85, 'run': 86, 'new': 87, 'bars': 88, 'thats': 89, 'days': 90, 'charging': 91, 'mother': 92, 'problem': 93, 'pocket': 94, 'pc': 95, 'combination': 96, 'ive': 97, 'owned': 98, '7': 99, 'months': 100, 'best': 101, 'mobile': 102, 'didnt': 103, 'think': 104, 'instructions': 105, 'provided': 106, 'helpful': 107, 'people': 108, 'couldnt': 109, 'hear': 110, 'talk': 111, 'pull': 112, 'earphone': 113, 'doesnt': 114, 'hold': 115, 'charge': 116, 'simple': 117, 'little': 118, 'breakage': 119, 'unacceptible': 120, 'product': 121, 'ideal': 122, 'like': 123, 'ears': 124, 'sensitive': 125, 'unusable': 126, 'moving': 127, 'car': 128, 'freeway': 129, 'speed': 130, 'years': 131, 'left': 132, 'contract': 133, 'hate': 134, 'ac': 135, 'included': 136, 'sure': 137, 'juicehighy': 138, 'recommended': 139, 'need': 140, '3': 141, 'mins': 142, 'book': 143, 'turn': 144, 'phonebattery': 145, 'life': 146, 'short': 147, 'kept': 148, 'poor': 149, 'performance': 150, 'fine': 151, '680': 152, 'worthless': 153, 'camera': 154, '2mp': 155, 'pics': 156, 'nice': 157, 'clear': 158, 'picture': 159, 'priced': 160, 'garbage': 161, 'audio': 162, 'bluetooth': 163, 'features': 164, 'want': 165, 'mind': 166, 'gonna': 167, 'buy': 168, 'arguing': 169, 'verizon': 170, 'dropped': 171, 'calls': 172, 'returned': 173, 'phones': 174, 'disappointed': 175, 'loud': 176, 'protection': 177, 'bulky': 178, 'usable': 179, 'keyboard': 180, 'actually': 181, 'turns': 182, 'pda': 183, 'realworld': 184, 'useful': 185, 'machine': 186, 'instead': 187, 'neat': 188, 'gadget': 189, 'pretty': 190, 'sturdy': 191, 'large': 192, 'love': 193, 'thing': 194, 'reasonable': 195, 'price': 196, 'ie': 197, 'stream': 198, 'submerged': 199, '15': 200, 'seconds': 201, 'happy': 202, '510': 203, 'complaints': 204, 'end': 205, 'buttons': 206, 'bad': 207, 'essentially': 208, 'forget': 209, 'microsofts': 210, 'tech': 211, 'support': 212, 'faceplates': 213, 'looks': 214, 'elegant': 215, 'cool': 216, 'headphones': 217, 'find': 218, 'purchase': 219, 'seriously': 220, 'different': 221, 'particular': 222, 'angle': 223, 'party': 224, 'clearly': 225, 'big': 226, 'drawback': 227, 'mp3': 228, 'player': 229, 'cover': 230, 'let': 231, 'pause': 232, 'skip': 233, 'songs': 234, 'lock': 235, 'week': 236, 'later': 237, 'activated': 238, 'suddenly': 239, 'died': 240, 'feels': 241, 'headsets': 242, 'wear': 243, 'glasses': 244, 'gets': 
245, 'ipods': 246, 'device': 247, 'situations1': 248, 'work': 249, 'bmw': 250, 'series': 251, 'fairly': 252, 'quiet': 253, 'trouble': 254, 'hearing': 255, 'person': 256, 'saying': 257, 'choice': 258, 'docking': 259, 'station': 260, 'home': 261, 'beautiful': 262, 'd807wrongly': 263, 'advertised': 264, 'd807': 265, 'item': 266, 'handy': 267, 'lot': 268, 'purchased': 269, '2': 270, 'longer': 271, 'working': 272, 'everyday': 273, 'holds': 274, 'bargain': 275, 'packaged': 276, 'arrived': 277, 'intended': 278, 'runs': 279, 'quickly': 280, 'worked': 281, 'broke': 282, '6': 283, 'easy': 284, 'loves': 285, 'construction': 286, 'better': 287, 'boy': 288, 'cheaper': 289, 'loads': 290, 'super': 291, 'costs': 292, 'expect': 293, 'greater': 294, 'ease': 295, 'buds': 296, 'play': 297, 'music': 298, 'dont': 299, 'order': 300, 'plan': 301, 'found': 302, 'waaay': 303, 'tried': 304, 'bluetooths': 305, 'listener': 306, 'im': 307, 'decision': 308, 'integrated': 309, 'seamlessly': 310, 'motorola': 311, 'buyer': 312, 'beware': 313, 'flush': 314, 'toilet': 315, 'definitely': 316, 'free': 317, 'shipping': 318, 'received': 319, 'supposedly': 320, '375': 321, 'apparently': 322, 'match': 323, 'prosgood': 324, 'pictures': 325, 'styles': 326, 'black': 327, 'white': 328, 'huge': 329, 'flaw': 330, 'correctly': 331, '350': 332, 'jabra350': 333, 'reception': 334, 'piece': 335, 'fit': 336, 'rated': 337, 'impressive': 338, '13': 339, 'megapixels': 340, 'renders': 341, 'images': 342, 'fall': 343, 'expectations': 344, 'relatively': 345, 'high': 346, 'resolution': 347, 'purcashed': 348, 'wife': 349, 'ask': 350, 'slim': 351, 'light': 352, 'display': 353, 'geeky': 354, 'sex': 355, 'toast': 356, 'rocks': 357, 'oozes': 358, 'embedded': 359, 'sleek': 360, 'stylish': 361, 'leather': 362, 'fast': 363, 'compromise': 364, 'qwerty': 365, 'basic': 366, 'cell': 367, 'number': 368, 'keypad': 369, 'got': 370, 'completely': 371, 'unhappy': 372, 'winner': 373, 'setup': 374, 'simpler': 375, 'earpieces': 376, 'jabra': 377, 'fits': 378, 'comfortably': 379, 'strong': 380, 'signal': 381, 'iam': 382, 'pleased': 383, 'job': 384, 'basically': 385, 'service': 386, 'set': 387, 'weeks': 388, 'bt': 389, 'disapoinment': 390, 'small': 391, 'realize': 392, 'getting': 393, 'accompanied': 394, 'software': 395, 'brilliant': 396, 'nicely': 397, 'avoid': 398, 'damage': 399, 'definitly': 400, 'buyerbe': 401, 'careful': 402, 'majority': 403, 'logitech': 404, 'earbud': 405, 'failed': 406, 'stuff': 407, 'peachykeen': 408, 'house': 409, 'coverage': 410, 'upstairs': 411, 'basement': 412, 'voice': 413, 'recognition': 414, 'tremendous': 415, 'minute': 416, 'experienced': 417, 'drops': 418, 'area': 419, 'takes': 420, 'forever': 421, 'hours': 422, 'literally': 423, 'reccomendation': 424, 'relative': 425, 'glad': 426, 'items': 427, 'stated': 428, 'description': 429, 'screen': 430, 'sudden': 431, 'hoping': 432, 'linking': 433, '8530': 434, 'blackberry': 435, 'curve': 436, 'know': 437, 'sounds': 438, 'funny': 439, 'sketchy': 440, 'technology': 441, 'wouldnt': 442, 'wellwell': 443, 'wired': 444, 'kind': 445, 'messages': 446, 'web': 447, 'browsing': 448, 'significantly': 449, 'faster': 450, 'previous': 451, 'build': 452, 'unlike': 453, 'cheap': 454, 's': 455, 'fantastic': 456, 'perfectly': 457, 'colors': 458, 'w810i': 459, 'superb': 460, 'whine': 461, 'internet': 462, 'goesthe': 463, 'communications': 464, 'tool': 465, 'communicate': 466, 'charm': 467, 'maintain': 468, 'monkeys': 469, 'shouldnt': 470, 'obviously': 471, 'share': 472, 'dna': 473, 'copy': 474, 'humans': 475, 
'bougth': 476, 'l7c': 477, 'look': 478, 'sharp': 479, 'graphics': 480, 'mode': 481, 'button': 482, 'thank': 483, 'wasting': 484, 'bethe': 485, 'igo': 486, 'chargers': 487, 'tips': 488, 'file': 489, 'browser': 490, 'offers': 491, 'options': 492, 'needshandsfree': 493, 'network': 494, 'connected': 495, 'wifes': 496, 'bluetoothmotorola': 497, 'hs850': 498, 'latest': 499, 'os': 500, 'v115g': 501, 'likes': 502, 'slow': 503, 'crawl': 504, 'recognizes': 505, 'storage': 506, 'buzzing': 507, 'override': 508, 'bluetoooth': 509, 'functionality': 510, 'awesome': 511, 'thorn': 512, 'abhor': 513, 'recently': 514, 'stay': 515, '10': 516, 'minutes': 517, 'disconnected': 518, 'incredible': 519, 'bucks': 520, 'check': 521, 'mail': 522, 'night': 523, 'backlight': 524, 'message': 525, 'lost': 526, 'replacement': 527, 'ring': 528, 'toneoverall': 529, 'lately': 530, 'extremely': 531, 'wit': 532, 'hit': 533, 'dropping': 534, 'weight': 535, 'hardly': 536, 'youll': 537, 'thin': 538, 'pleather': 539, 'useless': 540, 'simply': 541, 'deaf': 542, 'color': 543, 'prettier': 544, 'thought': 545, 'incredibly': 546, 'investment': 547, 'strange': 548, 'ticking': 549, 'noises': 550, 'ends': 551, 'electronics': 552, 'available': 553, 'fm': 554, 'transmitters': 555, 'lasts': 556, 'h500': 557, '12': 558, 'mega': 559, 'pixel': 560, 'reasonably': 561, 'good7': 562, 'nearly': 563, 'transmit': 564, 'bother': 565, 'contacting': 566, 'company': 567, 'dollar': 568, 'learned': 569, 'lesson': 570, 'form': 571, 'online': 572, 'earbugs': 573, 'means': 574, 'range': 575, 'able': 576, 'roam': 577, 'living': 578, 'room': 579, 'receptionsound': 580, 'issues': 581, 'felt': 582, 'crack': 583, 'worst': 584, 'infatuated': 585, 'freezes': 586, 'frequently4': 587, 'embarrassing': 588, 'childlike': 589, 'lightweight': 590, 'id': 591, 'expected': 592, 'consumer': 593, 'experience': 594, 'theres': 595, 'horrible': 596, 'tick': 597, 'background': 598, 'certainly': 599, 'usually': 600, 'headbands': 601, 'mess': 602, 'hair': 603, 'bit': 604, 'year': 605, 'tell': 606, 'ordered': 607, 'sony': 608, 'ericsson': 609, 'favorite': 610, 'purchases': 611, 'market': 612, 'authentic': 613, 'shine': 614, 'comfort': 615, 'excited': 616, 'cute': 617, 'mistake': 618, 'disappointment': 619, 'calendar': 620, 'sync': 621, 'customer': 622, 'additional': 623, 'gels': 624, 'whatsoever': 625, 'defeats': 626, 'purpose': 627, 'worth': 628, 'penny': 629, 'wallet': 630, 'type': 631, 'excrutiatingly': 632, 'probably': 633, 'important': 634, 'aspect': 635, 'glove': 636, 'secure': 637, 'durable': 638, 'o': 639, 'gosh': 640, 'attractive': 641, 'appears': 642, 'factor': 643, 'rubberpetroleum': 644, 'smell': 645, 'unbearable': 646, 'caused': 647, 'return': 648, 'cable': 649, 'flimsy': 650, 'scary': 651, 'earpiece': 652, 'hands': 653, 'stereo': 654, 'month': 655, 'flawlessly': 656, 'absolutel': 657, 'junk': 658, 'real': 659, '8': 660, 'drain': 661, 'potentially': 662, 'fry': 663, 'unreliable': 664, 'giving': 665, 'gave': 666, 'stars': 667, 'reversible': 668, 'rotating': 669, 'feature': 670, 'family': 671, 'seller': 672, 'plantronics': 673, 'adorable': 674, 'buying': 675, 'poorly': 676, 'contstruct': 677, 'hinge': 678, 'installed': 679, 'charged': 680, 'overnite': 681, 'handset': 682, 'cat': 683, 'attacked': 684, 'scratched': 685, 'protective': 686, 'strip': 687, 'destroying': 688, 'terrible': 689, 'razor': 690, 'v3i': 691, 'wise': 692, 'shouldve': 693, 'invented': 694, 'sooner': 695, 'trythe': 696, 'engineered': 697, 'clever': 698, 'complained': 699, 'microphone': 700, 'weak': 701, 
'5year': 702, 'old': 703, 'nokia': 704, '2160': 705, 'tracfone': 706, 'care': 707, 'instruction': 708, 'manual': 709, 'lacking': 710, 'alarm': 711, 'clock': 712, 'removing': 713, 'antena': 714, 'uncomfortable': 715, 'compared': 716, 'plugged': 717, 'lg': 718, 'gotten': 719, 'compliments': 720, 'state': 721, 'allow': 722, 'usage': 723, 'driving': 724, 'immediately': 725, 'ngage': 726, 'earbuds': 727, 'dialing': 728, '23': 729, 'cant': 730, 'low': 731, 'howeverthe': 732, 'riingtones': 733, 'games': 734, 'amazon': 735, 'sucks': 736, 'rip': 737, 'came': 738, 'ago': 739, 'couple': 740, 'ipod': 741, '1': 742, 'recharge': 743, 'frequentyly': 744, 'flip': 745, 'phones2': 746, 'adhesive': 747, 'disappointing': 748, 'inexpensive': 749, 'practically': 750, 'add': 751, 'boost': 752, 'times': 753, 'concrete': 754, 'knock': 755, 'wood': 756, 'transformed': 757, 'organizational': 758, 'capability': 759, 'easier': 760, 'sitting': 761, 'vehicle': 762, 'cradle': 763, 'belt': 764, 'jerks': 765, 'los': 766, 'angeles': 767, 'starter': 768, 'wireless': 769, 'loudspeaker': 770, 'option': 771, 'bumpers': 772, 'lights': 773, 'appealing': 774, 'improve': 775, 'leaks': 776, 'hot': 777, 'according': 778, 'called': 779, 'applifies': 780, 'save': 781, 'specially': 782, 'face': 783, 'transmission': 784, 's11': 785, 'data': 786, 'finished': 787, 'looking': 788, 'happier': 789, 'ill': 790, 'drivng': 791, 'starts': 792, 'ringing': 793, 'reason': 794, 'having': 795, 'auto': 796, 'reverse': 797, 'tape': 798, 'embarassing': 799, 'hurt': 800, 'try': 801, 'push': 802, 'protects': 803, 'sides': 804, 'average': 805, 'operates': 806, 'skype': 807, 'soyo': 808, 'self': 809, 'portraits': 810, 'outside': 811, 'exterior': 812, 'mentioned': 813, 'trying': 814, 'handsfree': 815, 'gadgets': 816, 'finally': 817, 'magical': 818, 'help': 819, 'crap': 820, 'shipped': 821, 'promptly': 822, 'exactly': 823, 'wanted': 824, 'comparablypriced': 825, 'offering': 826, 'today': 827, 'deal': 828, 'satisfied': 829, 'encourage': 830, 'youd': 831, 'effective': 832, 'waiting': 833, 'recieve': 834, 'prompt': 835, 'especially': 836, 'stupid': 837, 'cradles': 838, 'kits': 839, 'comes': 840, 'excelent': 841, 'cingulair': 842, 'nicer': 843, 'noticed': 844, 'era': 845, 'colored': 846, 'goes': 847, 'dead': 848, 'hoursthe': 849, 'thereplacement': 850, '2000': 851, 'cheaply': 852, 'att': 853, 'distorted': 854, 'yell': 855, 'plastic': 856, 'breaks': 857, 'oh': 858, 'forgot': 859, 'mention': 860, 'weird': 861, 'effect': 862, 'iriver': 863, 'spinn': 864, 'unit': 865, 'fond': 866, 'magnetic': 867, 'strap': 868, 'overall': 869, 'psyched': 870, 'appointments': 871, 'note': 872, 'appearance': 873, 'bland': 874, 'model': 875, 'sanyo': 876, 'survived': 877, 'dozens': 878, 'blacktop': 879, 'earphones': 880, 'finds': 881, 'away': 882, 'enter': 883, 'modest': 884, 'cellular': 885, 'clarity': 886, 'warning': 887, 'wish': 888, 'awsome': 889, 'drained': 890, 'dying': 891, 'earpad': 892, 'onlyi': 893, 'displeased': 894, 'defect': 895, 'risk': 896, 'built': 897, 'difficult': 898, 'install': 899, 'restored': 900, 'purchasing': 901, 'jx10': 902, 'moto': 903, 'q': 904, 'figure': 905, 'searched': 906, 'size': 907, 'key': 908, 'pad': 909, 'lit': 910, 'hard': 911, 'wasnt': 912, 'portable': 913, 'colleague': 914, 'receptiona': 915, 'expensive': 916, 'fully': 917, 'bed': 918, 'turned': 919, 'wifi': 920, '20': 921, 'morning': 922, 'reading': 923, 'memory': 924, 'card': 925, 'wearing': 926, 'hat': 927, 'sunglasses': 928, 'timely': 929, 'shipment': 930, 'solid': 931, 'surefire': 932, 'gx2': 
933, 'bt50': 934, 'computer': 935, 'buyers': 936, 'remorse': 937, 'accessoryone': 938, 'inexcusable': 939, 'returning': 940, 'changing': 941, 'carriers': 942, 'tmobile': 943, 'update': 944, 'procedure': 945, 'cumbersome': 946, 'delivery': 947, 'vx9900': 948, 'env': 949, 'switch': 950, 'rocketed': 951, 'destination': 952, 'unknown': 953, 'longwearing': 954, 'conditions': 955, 'worthwhile': 956, 'usefulness': 957, 'verizons': 958, 'bills': 959, 'understand': 960, 'pricing': 961, 'plans': 962, 'overnight': 963, 'batteries': 964, 'wont': 965, 'regret': 966, 'user': 967, 'friendly': 968, 'unfortunately': 969, 'ability': 970, 'receiving': 971, 'pitiful': 972, 'respect': 973, 'exchanged': 974, 'results': 975, 'stuck': 976, 'max': 977, 'mute': 978, 'hybrid': 979, 'palmtopcameracellphone': 980, 'excels': 981, 'roles': 982, 'bt250v': 983, 'liked': 984, 'wrong': 985, 'described': 986, '11': 987, 'bose': 988, 'noise': 989, 'cancelling': 990, 'amazing': 991, 'nyc': 992, 'commuter': 993, 'defective': 994, 'given': 995, 'star': 996, 'unacceptableunless': 997, 'holster': 998, 'photo': 999, 'ad': 1000, 'greatno': 1001, 'earlier': 1002, 'review': 1003, 'noted': 1004, 'happens': 1005, 'frog': 1006, 'eye': 1007, 'catching': 1008, 'pushed': 1009, 'function': 1010, 'amazed': 1011, 'aluminum': 1012, 'palm': 1013, 'vx': 1014, 'wellit': 1015, 'protected': 1016, 'handheld': 1017, 'tools': 1018, 'sturdiness': 1019, 'orders': 1020, 'timeframe': 1021, 'source': 1022, 'waterproof': 1023, 'complaint': 1024, 'standard': 1025, '5of': 1026, 'thanks': 1027, 'things': 1028, 'ended': 1029, 'sliding': 1030, 'edge': 1031, 'pants': 1032, 'pockets': 1033, 'store': 1034, 'ugly': 1035, 'shield': 1036, 'incrediable': 1037, 'improvement': 1038, 'refuse': 1039, 'refund': 1040, 'replace': 1041, 'accidentally': 1042, 'activate': 1043, 'gentletouch': 1044, 'touch': 1045, 'listening': 1046, 'threw': 1047, 'window': 1048, 'took': 1049, 'drop': 1050, 'inches': 1051, 'kitchen': 1052, 'counter': 1053, 'crackedi': 1054, 'laughing': 1055, 'trunk': 1056, 'carried': 1057, 'conversation': 1058, 'hitch': 1059, 'practical': 1060, 'ample': 1061, 'eargels': 1062, 'channel': 1063, 'directly': 1064, 'increase': 1065, 'ones': 1066, 'properly': 1067, 'missed': 1068, 'numerous': 1069, 'sucked': 1070, 'shifting': 1071, 'bubbling': 1072, 'peeling': 1073, 'scratch': 1074, 'nothingi': 1075, 'droid': 1076, 'zero': 1077, 'exercise': 1078, 'frustration': 1079, 'earset': 1080, 'outgoing': 1081, 'total': 1082, 'package': 1083, 'understanding': 1084, 'patient': 1085, 'wirefly': 1086, 'stari': 1087, 'contact': 1088, 'cingularatt': 1089, 'inform': 1090, 'practice': 1091, 'aggravating': 1092, 'friends': 1093, 'enjoy': 1094, 'virgin': 1095, 'muddy': 1096, 'casing': 1097, 'wires': 1098, 'insert': 1099, 'glued': 1100, 'slid': 1101, 'isnt': 1102, 'plantronincs': 1103, 'continues': 1104, 'flawed': 1105, 'disapointing': 1106, 'fourth': 1107, 'hated': 1108, 'fixes': 1109, 'accessing': 1110, 'downloading': 1111, 'ringtones': 1112, 'performing': 1113, 'functions': 1114, 'barely': 1115, 'constantly': 1116, 'unacceptable': 1117, 'joke': 1118, 'said': 1119, 'happening': 1120, 'forced': 1121, 'stop': 1122, 'adapters': 1123, 'walked': 1124, 'procedures': 1125, 'reset': 1126, 'wiping': 1127, 'strength': 1128, 'plays': 1129, 'louder': 1130, 'speaker': 1131, 'constructed': 1132, 'menus': 1133, 'navigate': 1134, 'recessed': 1135, 'holding': 1136, 'onid': 1137, 'avoiding': 1138, 'brokeni': 1139, 'smoking': 1140, 'sprint': 1141, 'linked': 1142, 'effort': 1143, 'possesed': 1144, 'idea': 
1145, 'trash': 1146, 'research': 1147, 'development': 1148, 'division': 1149, 'knows': 1150, 'theyre': 1151, 'killer': 1152, 'course': 1153, 'breaking': 1154, 'infuriating': 1155, 'walkman': 1156, 'charges': 1157, 'feel': 1158, 'europe': 1159, 'asia': 1160, 'clipping': 1161, 'deffinitely': 1162, '50': 1163, 'cents': 1164, 'upandcoming': 1165, 'behing': 1166, '5020': 1167, 'comfortible': 1168, '24': 1169, 'day': 1170, 'pain': 1171, 'quick': 1172, 'arrival': 1173, 'fraction': 1174, 'samsungcrap': 1175, 'crappy': 1176, 'samsung': 1177, 'e715': 1178, 'seeen': 1179, 'stopped': 1180, 'needed': 1181, 'operate': 1182, 'screenthis': 1183, 'interface': 1184, 'decade': 1185, 'compete': 1186, 'designs': 1187, 'paired': 1188, 'treo': 1189, '700w': 1190, 'usb': 1191, 'transceiver': 1192, 'steer': 1193, 'genuine': 1194, 'replacementr': 1195, 'pens': 1196, 'come': 1197, 'threepack': 1198, 'buyit': 1199, 'beats': 1200, 'fingers': 1201, 'plus': 1202, 'believe': 1203, 'steep': 1204, 'point': 1205, 'cases': 1206, 'normally': 1207, 'apart': 1208, 'haul': 1209, 'dissapointing': 1210, 'brand': 1211, 'extra': 1212, 'originally': 1213, 'discarded': 1214, 'phonesmp3': 1215, 'players': 1216, 'posted': 1217, 'detailed': 1218, 'comments': 1219, 'grey': 1220, 'red': 1221, 'pay': 1222, 'guess': 1223, 'existing': 1224, 'cds': 1225, 'connection': 1226, 'surprised': 1227, 'reviews': 1228, 'fabulous': 1229, 'currently': 1230, 'firstperson': 1231, 'shooters': 1232, 'delay': 1233, 'messes': 1234, 'bitpim': 1235, 'program': 1236, 'internetto': 1237, 'transfer': 1238, 'phonethe': 1239, 'accessory': 1240, 'manufacturer': 1241, 'performed': 1242, 'awful': 1243, 'muffled': 1244, 'tinny': 1245, 'incoming': 1246, 'severe': 1247, 'echo': 1248, 'windresistant': 1249, 'overly': 1250, 'replaceeasy': 1251, 'contacted': 1252, 'told': 1253, 'warranty': 1254, 'produce': 1255, 'receipt': 1256, 'luck': 1257, 'linksys': 1258, 'exchange': 1259, 'refurb': 1260, 'bar': 1261, 'placed': 1262, 'snug': 1263, 'heavyit': 1264, 'keeps': 1265, 'falling': 1266, 'utter': 1267, 'promised': 1268, 'loop': 1269, 'tiny': 1270, 'spring': 1271, 'latch': 1272, 'visor': 1273, 'tries': 1274, 'download': 1275, 'address': 1276, 'rebootsoverall': 1277, 'rate': 1278, 'tungsten': 1279, 'e2': 1280, 'flipphones': 1281, 'welldesigned': 1282, 'smoothly': 1283, 'study': 1284, 'interested': 1285, 'sins': 1286, 'industrial': 1287, 'happened': 1288, 'tracking': 1289, 'access': 1290, 'detachable': 1291, 'continue': 1292, 'pairing': 1293, 'periodically': 1294, 'upload': 1295, 'locks': 1296, 'screens': 1297, 'flash': 1298, 'randomly': 1299, 'locked': 1300, 'truly': 1301, '325': 1302, 'cellphone': 1303, 'wornout': 1304, 'ringer': 1305, 'choices': 1306, 'tones': 1307, 'acceptable': 1308, 'balance': 1309, 'ready': 1310, 'prime': 1311, 'coming': 1312, 'upbeat': 1313, 'chinese': 1314, 'forgeries': 1315, 'abound': 1316, 'explain': 1317, 'jack': 1318, 'ca42': 1319, 'crisp': 1320, 'smallest': 1321, 'stays': 1322, 'biggest': 1323, 'drains': 1324, 'superfast': 1325, 'ergonomic': 1326, 'theory': 1327, 'stand': 1328, 'video': 1329, 'clips': 1330, 'occupied': 1331, 'distracting': 1332, 'hour': 1333, 'entire': 1334, 'accept': 1335, 'cbr': 1336, 'mp3s': 1337, 'preferably': 1338, 'ripped': 1339, 'windows': 1340, 'media': 1341, 'beat': 1342, 'shots': 1343, 'sos': 1344, 'signals': 1345, 'allows': 1346, 'connect': 1347, 'miniusb': 1348, 'near': 1349, 'open': 1350, 'allowing': 1351, 'startac': 1352, 'regretted': 1353, 'outperform': 1354, 'china': 1355, 'v325i': 1356, 'numbers': 1357, 'sim': 1358, 
'3o': 1359, 'phonemy': 1360, 'r': 1361, 'crashed': 1362, 'replaced': 1363, 'quit': 1364, '18': 1365, 'iphone': 1366, '4s': 1367, 'despite': 1368, 'connecting': 1369, 'multiple': 1370, 'power': 1371, 'sources': 1372, 'imac': 1373, 'external': 1374, 'wall': 1375, 'outlet': 1376, 'etc': 1377, 'bells': 1378, 'whistles': 1379, 'mediocre': 1380, 'slide': 1381, 'grip': 1382, 'prevents': 1383, 'slipping': 1384, 'hand': 1385, 'onethis': 1386, 'span': 1387, 'exclaim': 1388, 'whoa': 1389, 'tv': 1390, 'corded': 1391, 'freedom': 1392, 'passed': 1393, 'mark': 1394, 'shows': 1395, 'signs': 1396, '100': 1397, 'functional': 1398, 'soft': 1399, 'tight': 1400, 'cut': 1401, 'shape': 1402, 'copier': 1403, 'sizes': 1404, 'sent': 1405, 'sold': 1406, 'units': 1407, 'pros': 1408, 'provides': 1409, 'classy': 1410, 'krussel': 1411, 'tracfonewebsite': 1412, 'toactivate': 1413, 'good4': 1414, 'texas': 1415, 'dit': 1416, '5320': 1417, 'mainly': 1418, 'soon': 1419, 'blueant': 1420, 'supertooth': 1421, 'metro': 1422, 'pcs': 1423, 'schr450': 1424, 'slider': 1425, 'premium': 1426, 'plugs': 1427, 'plenty': 1428, 'capacity': 1429, 'confortable': 1430, 'somewhat': 1431, 'periods': 1432, 'ant': 1433, 'hey': 1434, 'pleasantly': 1435, 'suprised': 1436, 'cost': 1437, 'dustpan': 1438, 'indoors': 1439, 'disposable': 1440, 'puff': 1441, 'smoke': 1442, 'convenient': 1443, 'ride': 1444, 'smoother': 1445, 'nano': 1446, 'itmy': 1447, 'son': 1448, 'dissapointed': 1449, 'reccommend': 1450, 'carries': 1451, 'highest': 1452, 'antiglare': 1453, 'protector': 1454, 'date': 1455, 'smartphone': 1456, 'atleast': 1457, 'addition': 1458, 'amp': 1459, 'reoccurebottom': 1460, 'cingular': 1461, 'methe': 1462, 'creaks': 1463, 'wooden': 1464, 'floor': 1465, 'apartment': 1466, 'generally': 1467, 'inconspicuous': 1468, 'boot': 1469, 'slowly': 1470, 'sorry': 1471, 'impossible': 1472, 'refused': 1473, 'upgrade': 1474, 'discount': 1475, 'securly': 1476, 'possibility': 1477, 'double': 1478, 'booking': 1479, 'break': 1480, 'entertainment': 1481, 'communication': 1482, 'managementoh': 1483, 'activesync': 1484, '42': 1485, 'optimal': 1486, 'synchronization': 1487, 'disgusting': 1488, 'coupon': 1489, 'rare': 1490, 'instance': 1491, 'perfect': 1492, 'ps3': 1493, 'cheapy': 1494, 'lots': 1495, 'sounded': 1496, 'talking': 1497, 'shouting': 1498, 'telephone': 1499, 'wind': 1500, 'yes': 1501, 'shiny': 1502, 'grtting': 1503, '744': 1504, 'v3c': 1505, 'thumbs': 1506, 'exceeds': 1507, 'feet': 1508, 'sight': 1509, 'improper': 1510, 'chargelife': 1511, 'checked': 1512, 'ordering': 1513, 'effects': 1514, 'palms': 1515, 'awkward': 1516, 'hoped': 1517, 'father': 1518, 'v265': 1519, 'pads': 1520, 'easily': 1521, 'stops': 1522, 'intermittently': 1523, 'reaching': 1524, 'row': 1525, 'send': 1526, 'keys': 1527, 'be3': 1528, 'nightmare': 1529, 'describe': 1530, 'speakerphone': 1531, 'cassette': 1532, 'current': 1533, 'cellphones': 1534, 'planning': 1535, 'says': 1536, 'dirty': 1537, 'autoanswer': 1538, 'read': 1539, 'havent': 1540, 'products': 1541, 'sensor': 1542, 'reliability': 1543, 'beeping': 1544, 'letting': 1545, 'dieing': 1546, 'laptop': 1547, 'ir': 1548, 'yearsgreat': 1549, 'cancellation': 1550, 'counterfeit': 1551, 'travled': 1552, 'swivel': 1553, 'sister': 1554, 'dualpurpose': 1555, '8125': 1556, 'keeping': 1557, 'inside': 1558, 'bottowm': 1559, 'lineanother': 1560, 'gimmick': 1561, 'opens': 1562, 'broken': 1563, 'causing': 1564, 'discomfort': 1565, 'trust': 1566, 'loudglad': 1567, 'maintains': 1568, 'flawless': 1569, 'normal': 1570, 'making': 1571, 'fails': 1572, 
'wrongfirst': 1573, 'devices': 1574, 'utterly': 1575, 'confusing': 1576, 'lose': 1577, 'holder': 1578, 'cutouts': 1579, 'landline': 1580, 'loops': 1581, 'material': 1582, 'flaws': 1583, 'exceptional': 1584, 'owning': 1585, 'official': 1586, 'oem': 1587, 'loudest': 1588, 'setting': 1589, 'competitors': 1590, 'saved': 1591, 'alot': 1592, 'cuts': 1593, 'beep': 1594, 'ok': 1595, 'totally': 1596, 'unintelligible': 1597, 'word': 1598, 'restart': 1599, 'managed': 1600, 'bend': 1601, 'leaf': 1602, 'metal': 1603, 'stress': 1604, 'leopard': 1605, 'print': 1606, 'wonderfully': 1607, 'wild': 1608, 'saggy': 1609, 'floppy': 1610, 'looses': 1611, 'abovepretty': 1612, 'soundwise': 1613, 'snap': 1614, '8525': 1615, 'carry': 1616, 'fliptop': 1617, 'loose': 1618, 'wobbly': 1619, 'eventually': 1620, 'receive': 1621, 'seat': 1622, 'fulfills': 1623, 'requirements': 1624, 'fact': 1625, 'rests': 1626, 'lightly': 1627, 'websites': 1628, 'rating': 1629, 'cables': 1630, 'lap': 1631, 'controls': 1632, 'accessable': 1633, 'christmas': 1634, 'rest': 1635, 'joy': 1636, 'satisifed': 1637, '2005': 1638, 's710a': 1639, 'wow': 1640, 'specs': 1641, 'armband': 1642, 'allot': 1643, 'clearer': 1644, 'keypads': 1645, 'reach': 1646, 'ericson': 1647, 'z500a': 1648, 'motor': 1649, 'control': 1650, 'center': 1651, 'voltage': 1652, 'humming': 1653, 'equipment': 1654, 'certain': 1655, 'places': 1656, 'girl': 1657, 'complain': 1658, 'wake': 1659, 'styling': 1660, 'restocking': 1661, 'fee': 1662, 'darn': 1663, 'lousy': 1664, 'seen': 1665, 'sweetest': 1666, 'securely': 1667, 'hook': 1668, 'directed': 1669, 'canal': 1670, 'problemvery': 1671, 'unsatisfactory': 1672, 'videos': 1673, 'negatively': 1674, 'adapter': 1675, 'provide': 1676, 'hype': 1677, 'assumed': 1678, 'lense': 1679, 'covered': 1680, 'falls': 1681, 'text': 1682, 'messaging': 1683, 'tricky': 1684, 'painful': 1685, 'lasted': 1686, 'blew': 1687, 'flops': 1688, 'smudged': 1689, 'touches': 1690, 'disappoint': 1691, 'infra': 1692, 'port': 1693, 'irda': 1694, 'answer': 1695}
%% Cell type:code id: tags:
``` python
# verify the length of the created dictionary
print(len(vocab_to_int))
```
%% Output
1695
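Since the mapping starts at 1, index 0 is never assigned to a word, which is what makes the later `vocab_size = len(vocab_to_int) + 1` and the zero-padding consistent. A quick sanity check, added here only as a sketch:

``` python
# Sketch: confirm no word was mapped to 0, so 0 can safely serve as the padding value.
assert min(vocab_to_int.values()) == 1
assert max(vocab_to_int.values()) == len(vocab_to_int) # 1695 contiguous ids
```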
%% Cell type:code id: tags:
``` python
# count word occurrences separately for positive and for negative reviews
positive_counts = Counter()
negative_counts = Counter()
for i in range(len(clean_reviews)):
if(str(labels[i]) == '1\n'):
for word in clean_reviews[i].split(" "):
positive_counts[word] += 1
else:
for word in clean_reviews[i].split(" "):
negative_counts[word] += 1
```
%% Cell type:code id: tags:
``` python
positive_counts.most_common()[:10]
```
%% Output
[('great', 92),
('phone', 86),
('good', 62),
('works', 46),
('product', 33),
('quality', 31),
('headset', 31),
('sound', 27),
('excellent', 26),
('price', 25)]
%% Cell type:code id: tags:
``` python
negative_counts.most_common()[:10]
```
%% Output
[('phone', 76),
('dont', 26),
('work', 25),
('battery', 23),
('product', 22),
('use', 20),
('ear', 19),
('money', 18),
('quality', 18),
('time', 16)]
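Notice that frequent words like "phone" and "product" sit near the top of both lists, so raw counts alone say little about polarity. A positive-to-negative ratio, sketched below and not part of the notebook's pipeline, separates sentiment-bearing words more cleanly:

``` python
# Sketch: a simple polarity score per word from the two counters above.
pos_neg_ratios = {}
for word, count in positive_counts.most_common():
    if count > 10: # ignore rare words
        pos_neg_ratios[word] = count / (negative_counts[word] + 1.0)

# ratios well above 1 skew positive, well below 1 skew negative
print(sorted(pos_neg_ratios.items(), key=lambda kv: kv[1], reverse=True)[:5])
```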
%% Cell type:code id: tags:
``` python
print("Labels : {}".format(set(labels)))
```
%% Output
Labels : {'1\n', '0\n'}
%% Cell type:code id: tags:
``` python
vocab_to_int['jawbone']
```
%% Output
9
%% Cell type:code id: tags:
``` python
# encode labels: 1 for positive, 0 for negative (binary values, not a true one-hot matrix)
def one_hot(labels):
one_hot_labels = []
for i in range(len(labels)):
if (labels[i] == '1\n'):
one_hot_labels.append(1)
else:
one_hot_labels.append(0)
return one_hot_labels
encoded_labels = one_hot(labels)
```
%% Cell type:code id: tags:
``` python
print("Length of encoded labels :{} ".format(len(encoded_labels)))
print("Length of reviews list :{} ".format(len(reviews_list)))
```
%% Output
Length of encoded labels :1000
Length of reviews list :1000
%% Cell type:code id: tags:
``` python
# prepare integer-encoded reviews for training the sentiment model
# (exactly one of the four variants below should be active at a time)
# reviews_ints = []
# for review in clean_reviews:
# reviews_ints.append([vocab_to_int[word] for word in review.split()])
# variant: nouns removed from reviews
# reviews_ints = []
# for review in clean_reviews_no_nouns:
# reviews_ints.append([vocab_to_int[word] for word in review.split()])
# variant: adjectives removed from reviews (currently active)
reviews_ints = []
for review in clean_reviews_no_adjectives:
reviews_ints.append([vocab_to_int[word] for word in review.split()])
# variant: verbs removed from reviews
# reviews_ints = []
# for review in clean_reviews_no_verbs:
# reviews_ints.append([vocab_to_int[word] for word in review.split()])
```
%% Cell type:code id: tags:
``` python
# Check for empty reviews: after removing adjectives and stop words, some reviews lose all their words and would become all-zero inputs after padding.
review_lens = Counter([len(x) for x in reviews_ints])
empty_reviews_present = (review_lens[0]>0)
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))
```
%% Output
Zero-length reviews: 20
Maximum review length: 14
%% Cell type:code id: tags:
``` python
if empty_reviews_present:
    print('Number of reviews before removing outliers: ', len(reviews_ints))
    ## remove any reviews/labels with zero length from the reviews_ints list
    # keep only the indices of reviews with non-zero length
    non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
    # remove 0-length reviews and their labels
    reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
    encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])
    print('Number of reviews after removing outliers: ', len(reviews_ints))
```
%% Output
Number of reviews before removing outliers: 1000
Number of reviews after removing outliers: 980
%% Cell type:code id: tags:
``` python
# Pad/truncate each encoded review to a fixed length; Keras pad_sequences
# left-pads with zeros and left-truncates by default.
def pad_features(reviews_ints, seq_length):
    return pad_sequences(reviews_ints, maxlen=seq_length)
```
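%% Cell type:code id: tags:
``` python
# Illustrative sketch of the padding behavior: short sequences gain
# leading zeros, and over-long ones lose their earliest tokens.
print(pad_features([[1, 2, 3]], seq_length=5))           # -> [[0 0 1 2 3]]
print(pad_features([[1, 2, 3, 4, 5, 6]], seq_length=5))  # -> [[2 3 4 5 6]]
```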
%% Cell type:code id: tags:
``` python
seq_length = 200  # well above the maximum review length of 14, so every review is left-padded
features = pad_features(reviews_ints, seq_length=seq_length)
# print the first 10 columns of the first 10 rows
print(features[:10, :10])
```
%% Output
[[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]]
%% Cell type:code id: tags:
``` python
# Split the dataset into train, validation and test sets (80/10/10)
train_frac = 0.8
test_and_val_frac = 0.2
# the held-out 20% is split evenly between validation and test
val_frac = 0.5
test_frac = 0.5
def train_test_val_split(features, encoded_labels):
    X_train, X_rest, y_train, y_rest = train_test_split(features, encoded_labels, test_size=test_and_val_frac, train_size=train_frac, random_state=5, shuffle=True)
    X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=test_frac, train_size=val_frac, random_state=5, shuffle=True)
    return X_train, X_val, X_test, y_train, y_val, y_test
train_x, val_x, test_x, train_y, val_y, test_y = train_test_val_split(features, encoded_labels)
```
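%% Cell type:code id: tags:
``` python
# Quick sanity-check sketch: the three splits should partition all
# 980 remaining reviews with no rows lost or duplicated.
assert len(train_x) + len(val_x) + len(test_x) == len(features)
assert len(train_y) + len(val_y) + len(test_y) == len(encoded_labels)
```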
%% Cell type:code id: tags:
``` python
## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
"\nValidation set: \t{}".format(val_x.shape),
"\nTest set: \t\t{}".format(test_x.shape))
## print out the shapes of your resultant label data
print("\t\t\t Label:")
print("Train set labels: \t\t{}".format(train_y[:10]))
print("\nValidation set labels: \t\t{}".format(val_y[:10]))
print("\nTest set labels: \t\t{}".format(test_y[:10]))
```
%% Output
Feature Shapes:
Train set: (784, 200)
Validation set: (98, 200)
Test set: (98, 200)
Label:
Train set labels: [0 1 1 1 1 0 1 1 1 0]
Validation set labels: [0 1 0 0 1 0 0 0 1 0]
Test set labels: [1 1 1 0 0 1 0 1 0 1]
%% Cell type:code id: tags:
``` python
# create Tensor datasets for train, test and val
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))
# dataloaders
batch_size = 32
# SHUFFLE training data
train_loader = DataLoader(train_data, batch_size=batch_size, drop_last= True, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, batch_size=1, drop_last = True)
```
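%% Cell type:code id: tags:
``` python
# Illustrative note: with drop_last=True the final incomplete batch is
# discarded, e.g. 784 training rows / 32 per batch -> 24 full batches
# (the remaining 16 rows are dropped each epoch).
print(len(train_loader), len(valid_loader), len(test_loader))
```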
%% Cell type:code id: tags:
``` python
# obtain one batch of training data and labels
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)  # the .next() method was removed in newer PyTorch; use the builtin next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)
```
%% Output
Sample input size: torch.Size([32, 200])
Sample input:
tensor([[ 0, 0, 0, ..., 998, 435, 745],
[ 0, 0, 0, ..., 584, 97, 1211],
[ 0, 0, 0, ..., 0, 1158, 478],
...,
[ 0, 0, 0, ..., 61, 605, 606],
[ 0, 0, 0, ..., 224, 110, 225],
[ 0, 0, 0, ..., 0, 653, 465]], dtype=torch.int32)
Sample label size: torch.Size([32])
Sample label:
tensor([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
1, 1, 1, 0, 0, 0, 0, 1])
%% Cell type:code id: tags:
``` python
# Check if a GPU is available.
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')
```
%% Output
No GPU available, training on CPU.
%% Cell type:code id: tags:
``` python
class SentimentLSTM(nn.Module):
    """
    The LSTM model that will be used to perform sentiment analysis.
    """
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.3):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentLSTM, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # embedding and LSTM layers (drop_prob applies between LSTM layers)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        # a heavier dropout (0.6) is applied between the fully-connected layers
        self.dropout = nn.Dropout(0.6)
        # stack of linear layers followed by a sigmoid
        self.fc1 = nn.Linear(hidden_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 16)
        self.fc4 = nn.Linear(16, output_size)
        self.sig = nn.Sigmoid()
    def forward(self, x, hidden):
        """
        Perform a forward pass of the model on some input and hidden state.
        """
        batch_size = x.size(0)
        # embedding and LSTM output
        x = x.long()
        embedd = self.embedding(x)
        lstm_out, hidden = self.lstm(embedd, hidden)
        # stack up LSTM outputs: (batch_size * seq_length, hidden_dim)
        lstm_out = lstm_out.reshape(-1, self.hidden_dim)
        # dropout and fully-connected layers
        out = self.dropout(lstm_out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.dropout(out)
        out = self.fc3(out)
        out = self.dropout(out)
        out = self.fc4(out)
        # sigmoid turns the final logits into probabilities
        sig_out = self.sig(out)
        # reshape so that batch_size is the first dimension
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]  # keep only the output at the last time step
        return sig_out, hidden
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for the hidden state and cell state of the LSTM
        weight = next(self.parameters()).data
        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden
```
%% Cell type:code id: tags:
``` python
# SentimentLSTM: Instantiate the model with these hyperparameters
vocab_size = len(vocab_to_int) + 1  # +1 to account for the 0 used as padding
output_size = 1
embedding_dim = 1000
hidden_dim = 256
n_layers = 2
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)
```
%% Output
SentimentLSTM(
(embedding): Embedding(1696, 1000)
(lstm): LSTM(1000, 256, num_layers=2, batch_first=True, dropout=0.3)
(dropout): Dropout(p=0.6, inplace=False)
(fc1): Linear(in_features=256, out_features=128, bias=True)
(fc2): Linear(in_features=128, out_features=64, bias=True)
(fc3): Linear(in_features=64, out_features=16, bias=True)
(fc4): Linear(in_features=16, out_features=1, bias=True)
(sig): Sigmoid()
)
%% Cell type:code id: tags:
``` python
# loss and optimization functions
lr=0.005
criterion = nn.BCELoss()  # expects probabilities, which the model's final sigmoid provides
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
```
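%% Cell type:code id: tags:
``` python
# Optional smoke test, a sketch not part of the original notebook: run a
# single batch through the model before the full training loop to confirm
# the output shape and that the loss backpropagates without errors.
xb, yb = next(iter(train_loader))
h0 = net.init_hidden(batch_size)
out, _ = net(xb, h0)
loss = criterion(out.squeeze(), yb.float())
loss.backward()
print(out.shape, loss.item())
```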
%% Cell type:code id: tags:
``` python
# Training and validation
epochs = 2
training_loss = []
validation_loss = []
counter = 0
print_every = 1
clip = 1  # gradient clipping threshold
# move model to GPU, if available
if train_on_gpu:
    net.cuda()
net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)
    # batch loop
    for inputs, labels in train_loader:
        counter += 1
        if train_on_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()
        # Creating new variables for the hidden state, otherwise