Pertemuan 6, Text Mining

# import libraries
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages (only needed on the first run)
import nltk

nltk.download('all')

# Read the Amazon review dataset into a DataFrame
df = pd.read_csv(
    'https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv'
)

def preprocess_text(text):
    """Lowercase, tokenize, remove English stop words, and lemmatize text.

    Args:
        text: The raw text string to clean.

    Returns:
        A single string made of the remaining lemmatized tokens joined by
        single spaces.
    """
    # Tokenize the lowercased text
    tokens = word_tokenize(text.lower())

    # Remove stop words. The stop-word list is loaded once into a set so the
    # membership test is not repeated against a freshly built list per token.
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the remaining tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    return ' '.join(lemmatized_tokens)

# Single analyzer instance shared by every get_sentiment call
analyzer = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Return 1 when the analyzer's 'pos' score for text is above zero, else 0."""
    positive_score = analyzer.polarity_scores(text)['pos']
    if positive_score > 0:
        return 1
    return 0

# Label every review by running get_sentiment over the 'reviewText' column
df['sentiment'] = df['reviewText'].apply(get_sentiment)

# Compare the predicted labels with the dataset's 'Positive' column
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(df['Positive'], df['sentiment']))
print(classification_report(df['Positive'], df['sentiment']))


Result:
[[ 1377  3390]
 [  620 14613]]
              precision    recall  f1-score   support

           0       0.69      0.29      0.41      4767
           1       0.81      0.96      0.88     15233

    accuracy                           0.80     20000
   macro avg       0.75      0.62      0.64     20000
weighted avg       0.78      0.80      0.77     20000
Cari Blog Ini

Rusdi Blog

Pertemuan 6, Text Mining

Komentar

Posting Komentar

Postingan populer dari blog ini

Installasi Pandas & Matplotlib Serta Dokumentasinya