- Sat 05 August 2017
- Data Science
- M Hendra Herviawan
- #NLP, #Python
Bag of Words¶
In [1]:
# Load library
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
In [3]:
# Build the example corpus: three short documents, each stripped of its
# leading and trailing whitespace as the list is constructed.
text_data = list(map(str.strip, (
    ' Interrobang. By Aishwarya Henriette ',
    'Parking And Going. By Karl Gautier',
    ' Today Is The night. By Jarek Prakash ',
)))
# Display the cleaned documents
text_data
Out[3]:
Create Bag of Words¶
In [7]:
# Create the bag-of-words feature matrix: fit a CountVectorizer on the
# documents in text_data and transform them in a single step.
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)
# Show the returned matrix object; call .toarray() on it to see the counts
# as a dense array.
bag_of_words
Out[7]:
In [8]:
# Show the feature matrix as a dense array
# (presumably one row per document and one column per vocabulary term;
# confirm against the displayed output).
bag_of_words.toarray()
Out[8]:
View Bag Of Words Matrix Column Headers¶
In [5]:
# Get feature names.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so get_feature_names_out() is used instead. It returns a
# NumPy array; .tolist() keeps feature_names a plain list of str, as before.
feature_names = count.get_feature_names_out().tolist()
# View feature names
feature_names
Out[5]:
View as Dataframe¶
In [7]:
# Create a data frame from the dense count matrix, labelling each column
# with the corresponding entry of feature_names.
pd.DataFrame(bag_of_words.toarray(), columns=feature_names)
Out[7]: