NLP: Bag of Words


Bag of Words

In [1]:
# Load library
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
In [3]:
# Sample documents; the first and last are deliberately padded with spaces
text_data = [
    '   Interrobang. By Aishwarya Henriette     ',
    'Parking And Going. By Karl Gautier',
    '    Today Is The night. By Jarek Prakash   ',
]

# Trim leading and trailing whitespace from every document
text_data = list(map(str.strip, text_data))

# Display the cleaned documents
text_data
Out[3]:
['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

Create Bag of Words

In [7]:
# Create the bag-of-words feature matrix: fit a CountVectorizer on the
# cleaned documents and transform them in a single call.
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

# Display the matrix repr (shape, storage format and stored-element count)
bag_of_words
Out[7]:
<3x15 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>
In [8]:
# Show the feature matrix as a dense NumPy array so the individual counts
# are visible (fine here because the matrix is tiny).
bag_of_words.toarray()
Out[8]:
array([[1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1]], dtype=int64)

View Bag Of Words Matrix Column Headers

In [5]:
# Get feature names (the vocabulary terms, in matrix-column order).
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement and returns an
# ndarray, so convert it to a plain list of str to keep the same result type.
if hasattr(count, "get_feature_names_out"):
    feature_names = count.get_feature_names_out().tolist()
else:
    # Older scikit-learn (< 1.0) only provides the legacy method
    feature_names = count.get_feature_names()

# View feature names
feature_names
Out[5]:
['aishwarya',
 'and',
 'by',
 'gautier',
 'going',
 'henriette',
 'interrobang',
 'is',
 'jarek',
 'karl',
 'night',
 'parking',
 'prakash',
 'the',
 'today']

View as DataFrame

In [7]:
# Densify the count matrix, then label its columns with the feature names
dense_counts = bag_of_words.toarray()
pd.DataFrame(data=dense_counts, columns=feature_names)
Out[7]:
aishwarya and by gautier going henriette interrobang is jarek karl night parking prakash the today
0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0
1 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0
2 0 0 1 0 0 0 0 1 1 0 1 0 1 1 1