Pertemuan 6, Text Mining
# import libraries
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Download the NLTK corpora (only needed the first time this runs).
import nltk

nltk.download('all')

# Load the Amazon review dataset into a DataFrame.
AMAZON_REVIEWS_URL = (
    'https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv'
)
df = pd.read_csv(AMAZON_REVIEWS_URL)
def preprocess_text(text):
    """Normalize a text: lowercase, tokenize, drop stop words, lemmatize.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    # Tokenize the lowercased text.
    tokens = word_tokenize(text.lower())

    # Build the English stop-word set once per call rather than once per
    # token; set membership is O(1).
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the surviving tokens.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a single string.
    return ' '.join(lemmatized_tokens)
# One analyzer instance, created once and reused by get_sentiment.
analyzer = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Label a text as positive (1) or not (0).

    Args:
        text: The text passed to ``analyzer.polarity_scores``.

    Returns:
        1 if the 'pos' entry of the returned scores is greater than 0,
        otherwise 0.
    """
    positive_score = analyzer.polarity_scores(text)['pos']
    if positive_score > 0:
        return 1
    return 0
# Label every review by running get_sentiment over the 'reviewText' column.
df['sentiment'] = df['reviewText'].apply(get_sentiment)

from sklearn.metrics import classification_report, confusion_matrix

# Compare the 'Positive' column against the labels computed above.
actual_labels = df['Positive']
predicted_labels = df['sentiment']
print(confusion_matrix(actual_labels, predicted_labels))
print(classification_report(actual_labels, predicted_labels))
Result:
[[ 1377  3390]
 [  620 14613]]
              precision    recall  f1-score   support

           0       0.69      0.29      0.41      4767
           1       0.81      0.96      0.88     15233

    accuracy                           0.80     20000
   macro avg       0.75      0.62      0.64     20000
weighted avg       0.78      0.80      0.77     20000
Komentar
Posting Komentar