NLP: Essential Preprocessing


Table of Contents

  1. Strip Whitespace
  2. Remove Punctuation
  3. Word Tokenizer
  4. Remove Stopwords
  5. Stemming Words
In [2]:
# Load libraries
import string
import numpy as np
In [7]:
# Create text
text_data = ['   Interrobang. By Aishwarya Henriette     ',
             'Parking And Going. By Karl Gautier',
             '    Today Is The night. By Jarek Prakash   ',
             'Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right?!?!']

1. Strip Whitespace

In [8]:
# Strip leading and trailing whitespace
# (the loop variable is renamed so it does not shadow the imported string module)
text_data = [text.strip() for text in text_data]

# Show text
text_data
Out[8]:
['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash',
 'Hi!!!! I. Love. This. Song....',
 '10000% Agree!!!! #LoveIT',
 'Right?!?!']
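
str.strip() removes only leading and trailing whitespace. If a string also contains runs of internal spaces, tabs, or newlines, a regular expression can collapse them. Below is a minimal sketch using the standard-library re module; the helper name normalize_whitespace is just for illustration and is not used elsewhere in this notebook.

import re

# Collapse every run of whitespace to a single space, then strip the ends
def normalize_whitespace(text: str) -> str:
    return re.sub(r'\s+', ' ', text).strip()

normalize_whitespace('   Interrobang.   By \t Aishwarya   Henriette ')
# 'Interrobang. By Aishwarya Henriette'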

2. Remove Punctuation

In [9]:
# Create function using string.punctuation to remove all punctuation
def remove_punctuation(sentence: str) -> str:
    return sentence.translate(str.maketrans('', '', string.punctuation))

# Apply function
text_data = [remove_punctuation(sentence) for sentence in text_data]

# Show text
text_data
Out[9]:
['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash',
 'Hi I Love This Song',
 '10000 Agree LoveIT',
 'Right']
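
string.punctuation covers only ASCII punctuation, so str.translate leaves characters such as curly quotes untouched. A regular-expression variant is sketched below: it drops any character that is neither a word character nor whitespace, which also catches non-ASCII punctuation but keeps underscores. The function name remove_punctuation_re is just illustrative.

import re

# Drop every character that is neither a word character nor whitespace
def remove_punctuation_re(sentence: str) -> str:
    return re.sub(r'[^\w\s]', '', sentence)

remove_punctuation_re('Hi!!!! I. Love. This. Song....')
# 'Hi I Love This Song'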

3. Word Tokenizer

In [17]:
import nltk
nltk.download("punkt")

# Load library
from nltk.tokenize import word_tokenize, sent_tokenize

# Tokenize sentences
# (sent_tokenize expects a single string, not a list, so the call below is left
#  commented out; see the sent_tokenize sketch after this cell's output)
# text_data = sent_tokenize(text_data)

# Tokenize words
text_data = [word_tokenize(sentence) for sentence in text_data]

# Show text
text_data
Out[17]:
[['Interrobang', 'By', 'Aishwarya', 'Henriette'],
 ['Parking', 'And', 'Going', 'By', 'Karl', 'Gautier'],
 ['Today', 'Is', 'The', 'night', 'By', 'Jarek', 'Prakash'],
 ['Hi', 'I', 'Love', 'This', 'Song'],
 ['10000', 'Agree', 'LoveIT'],
 ['Right']]
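
The commented-out sent_tokenize line above would raise an error because sent_tokenize expects a single string rather than a list. A minimal sketch of sentence tokenization on one string is below; it relies on the punkt model downloaded above, and the example sentence is made up for illustration.

from nltk.tokenize import sent_tokenize

# Split one string into a list of sentences
sent_tokenize('Today is the night. This is crazy. I love this song.')
# ['Today is the night.', 'This is crazy.', 'I love this song.']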

4. Remove Stopwords

In [19]:
# You will have to download the set of stop words the first time
import nltk
nltk.download('stopwords')

# Load library
from nltk.corpus import stopwords

# Load stop words
stop_words = stopwords.words('english')
#stop_words[:5]

# Remove stop words from each tokenized sentence
# (NLTK's stop word list is all lowercase, so compare lowercased tokens)
text_data = [[word for word in sentence if word.lower() not in stop_words]
             for sentence in text_data]

# Show text
text_data
[nltk_data] Downloading package stopwords to /home/nbuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Out[19]:
[['Interrobang', 'Aishwarya', 'Henriette'],
 ['Parking', 'Going', 'Karl', 'Gautier'],
 ['Today', 'night', 'Jarek', 'Prakash'],
 ['Hi', 'Love', 'Song'],
 ['10000', 'Agree', 'LoveIT'],
 ['Right']]
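
NLTK's English stop word list is all lowercase (which is why the comparison above lowercases each token) and contains roughly 180 entries, though the exact count varies across NLTK versions. Uncommenting stop_words[:5] in the cell above shows the first few entries:

# First few entries of the stop word list; note they are lowercase
stop_words[:5]
# ['i', 'me', 'my', 'myself', 'we']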

5. Stemming Words

Stemming reduces a word to its stem by identifying and removing affixes (for example, the gerund suffix "-ing") while keeping the root meaning of the word. NLTK's PorterStemmer implements the widely used Porter stemming algorithm.

In [22]:
# Load library
from nltk.stem.porter import PorterStemmer

# Create stemmer
porter = PorterStemmer()

# Apply stemmer
for sentence in text_data:
    print([porter.stem(word) for word in sentence])
['interrobang', 'aishwarya', 'henriett']
['park', 'go', 'karl', 'gautier']
['today', 'night', 'jarek', 'prakash']
['Hi', 'love', 'song']
['10000', 'agre', 'loveit']
['right']
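
Stemming is aggressive and can produce non-words such as 'thi' and 'agre' above. Lemmatization is a gentler alternative that maps words to dictionary forms. A minimal sketch with NLTK's WordNetLemmatizer follows; it needs the wordnet corpus, and the expected output is shown as a comment rather than captured from this notebook.

import nltk
nltk.download('wordnet')

# Load library
from nltk.stem import WordNetLemmatizer

# Create lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize a few words as verbs; unlike stemming, the results are real words
[lemmatizer.lemmatize(word, pos='v') for word in ['parking', 'going', 'loved']]
# ['park', 'go', 'love']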