Bachelor of Business Administration @PNU/Marketing Analytics

감성 분석 - 비지도 학습 | Sentiment Analysis - Unsupervised Learning (24.02.2022.)

Hazel Y. 2022. 4. 15. 11:21

다음은 비지도 학습을 통한 감성 분석 시 사용한 코드이다.

The following are the steps and code used for sentiment analysis with an unsupervised learning method.


1. Import necessary packages.

import pandas as pd

import matplotlib.pyplot as plt

import re

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score

 

2. Define a preprocessing function.

def data_text_cleaning(data):
    """Normalize a review string for sentiment scoring.

    Every character that is not an ASCII letter (a-z, A-Z) is replaced with a
    single space, and the result is converted to lowercase.

    Args:
        data: The raw review text (a ``str``).

    Returns:
        The cleaned, lowercased text; its length equals that of the text
        after substitution (one space per replaced character).
    """
    # Substitute first, then lowercase, so that only characters that were
    # already ASCII letters survive the cleaning.
    return re.sub('[^a-zA-Z]', ' ', data).lower()

 

3. Import datasets.

# Load the two review tables (successful and unsuccessful products) from the
# CSV files in the working directory.
suc, un = (pd.read_csv(csv_path) for csv_path in ('suc.csv', 'un.csv'))

 

4. Preprocessing

# Clean every review and write the result back as a whole column.
# Whole-column assignment replaces the element-wise chained indexing
# (df['review'][i] = ...), which triggers SettingWithCopyWarning and does not
# update the DataFrame when pandas copy-on-write is enabled. It also no longer
# depends on the index being exactly 0..n-1.

# successful product reviews
suc['review'] = [data_text_cleaning(str(text)) for text in suc['review']]

# unsuccessful product reviews
un['review'] = [data_text_cleaning(str(text)) for text in un['review']]

 

5. Define a function to get predicted sentiment labels.

# Classify a review by comparing VADER's compound score (overall sentiment
# index) against a threshold (usually 0.1): above +threshold is positive (1),
# below -threshold is negative (-1), and anything in between is neutral (0).
def get_sentiment(review, threshold=0.1):
    """Return the predicted sentiment label of a review.

    Args:
        review: The review text to score.
        threshold: Half-width of the neutral band around a compound score of
            0. Defaults to 0.1.

    Returns:
        1 (positive) if the compound score is greater than ``threshold``,
        0 (neutral) if it lies within ``[-threshold, threshold]``,
        -1 (negative) otherwise.
    """
    # Build the analyzer once and reuse it on later calls: constructing a
    # SentimentIntensityAnalyzer reloads its lexicon, which is wasteful when
    # this function is called once per review.
    analyzer = getattr(get_sentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_sentiment._analyzer = analyzer

    compound_score = analyzer.polarity_scores(review)['compound']

    # 1 positive, 0 neutral, -1 negative
    if compound_score > threshold:
        return 1
    if compound_score >= -threshold:
        return 0
    return -1

 

6. Create a column for the predicted sentiment labels(psentl).

# Build the predicted-label column in one assignment per DataFrame.
# This replaces element-wise chained indexing (df['psentl'][i] = ...), which
# triggers SettingWithCopyWarning and leaves the column untouched when pandas
# copy-on-write is enabled.

# successful product reviews
suc['psentl'] = [get_sentiment(str(text)) for text in suc['review']]

# unsuccessful product reviews
un['psentl'] = [get_sentiment(str(text)) for text in un['review']]

 

7. Create a column for the real sentiment labels(sent).

# Actual sentiment derived from the star rating:
# 4-5 stars -> positive (1), 3 stars -> neutral (0), 1-2 stars -> negative (-1).
def _rating_to_sentiment(rating):
    """Map a rating to a sentiment label: >= 4 -> 1, == 3 -> 0, any other value -> -1."""
    if rating >= 4:
        return 1
    if rating == 3:
        return 0
    return -1


# Each column is assigned as a whole instead of element by element through
# chained indexing (df['sent'][i] = ...), which triggers SettingWithCopyWarning
# and does not update the DataFrame when pandas copy-on-write is enabled.

# successful product reviews
suc['sent'] = [_rating_to_sentiment(rating) for rating in suc['rating']]

# unsuccessful product reviews
un['sent'] = [_rating_to_sentiment(rating) for rating in un['rating']]

 

8. Compare the predicted and real sentiment labels and compute accuracy, precision, recall and F1 scores as well as a confusion matrix.

# successful product reviews
# Compare the ground-truth labels from the original data with the labels
# predicted by VADER.
s_target, s_pred = suc['sent'], suc['psentl']

print(confusion_matrix(s_target, s_pred))
print("정확도 :", accuracy_score(s_target, s_pred))  # accuracy

# Precision, recall and F1 all use the same averaging mode.
# Available modes: micro, macro, weighted, samples
for s_caption, s_metric in (
    ("정밀도 :", precision_score),  # precision
    ("재현율 :", recall_score),  # recall
    ("F1 score :", f1_score),
):
    print(s_caption, s_metric(s_target, s_pred, average='weighted'))

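  • Average parameter (default: binary)
    • micro: compute the score globally from the total true positives, false negatives and false positives
    • macro: compute the score for each label and take the unweighted mean
    • weighted: compute the score for each label and take the mean weighted by support (the number of true instances of each label)
    • samples: compute the score for each instance and average the results (only meaningful for multilabel classification)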

# unsuccessful product reviews
# Compare the rating-based labels with the labels predicted from the text.
u_target, u_pred = un['sent'], un['psentl']

print(confusion_matrix(u_target, u_pred))
print("정확도 :", accuracy_score(u_target, u_pred))  # accuracy

# Precision, recall and F1 all use the same averaging mode.
# Available modes: micro, macro, weighted, samples
for u_caption, u_metric in (
    ("정밀도 :", precision_score),  # precision
    ("재현율 :", recall_score),  # recall
    ("F1 score :", f1_score),
):
    print(u_caption, u_metric(u_target, u_pred, average='weighted'))

 

9. Visualization

# Bar charts of the positive / neutral / negative shares, one figure per
# (dataset, label source) combination.
reaction = ['positive', 'neutral', 'negative']


def _sentiment_shares(labels):
    """Return the [positive, neutral, negative] shares of a column of 1/0/-1 labels.

    The negative share is taken as the remainder (1 - positive - neutral).
    An empty column yields [0.0, 0.0, 0.0] instead of dividing by zero.
    """
    labels = list(labels)
    total = len(labels)
    if total == 0:
        return [0.0, 0.0, 0.0]
    positive = labels.count(1) / total
    neutral = labels.count(0) / total
    return [positive, neutral, 1 - positive - neutral]


def _plot_reactions(shares, title):
    """Draw one bar chart of the three sentiment shares on its own figure."""
    # Start a new figure each time; otherwise every plt.bar call draws onto
    # the same axes and only the last title survives.
    plt.figure()
    plt.bar(reaction, shares, color=['b', 'g', 'r'])
    plt.title(title, fontsize=15)
    plt.show()


# successful product reviews
# Sentiment shares (predicted values) --> only with review text
values = _sentiment_shares(suc['psentl'])
sp_pos, sp_neu, sp_neg = values
_plot_reactions(values, 'Predicted Reactions for Successful Products')

# Sentiment shares (actual values) --> only with review score (star rating)
values = _sentiment_shares(suc['sent'])
sr_pos, sr_neu, sr_neg = values
_plot_reactions(values, 'Real Reactions for Successful Products')

# unsuccessful product reviews
# Sentiment shares (predicted values) --> only with review text
values = _sentiment_shares(un['psentl'])
up_pos, up_neu, up_neg = values
_plot_reactions(values, 'Predicted Reactions for Unsuccessful Products')

# Sentiment shares (actual values) --> only with review score (star rating)
values = _sentiment_shares(un['sent'])
ur_pos, ur_neu, ur_neg = values
_plot_reactions(values, 'Real Reactions for Unsuccessful Products')

 

 

 

* Unauthorized copying and distribution of this post are not allowed.

* 해당 글에 대한 무단 배포 및 복사를 허용하지 않습니다.