Bachelor of Business Administration @PNU/Marketing Analytics

LDA 토픽 모델링 | Topic Modeling using LDA (16.03.2022.)

Hazel Y. 2022. 4. 15. 22:08

다음은 LDA 토픽 모델링 시 사용한 코드이다.

The following are the steps and code used for topic modeling with LDA.


1. Import necessary packages.

import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis

import matplotlib.pyplot as plt

from nltk.corpus import stopwords

 

2. Import datasets.

# Load the two review datasets; the 'review' column of each is read in the
# tokenizing step.
suc, un = (pd.read_csv(csv_path) for csv_path in ('suc.csv', 'un.csv'))

 

3. Tokenizing

# Select the review text column of each dataset.
suc_rev, un_rev = suc['review'], un['review']

# Turn each column into a plain Python list with one entry per review.
suc_rl, un_rl = (reviews.values.tolist() for reviews in (suc_rev, un_rev))

# tokenizing
def sent_to_words(sentences):
    """Lazily yield one token list per entry of ``sentences``.

    Every entry is passed through ``str()`` first, so non-string values are
    tokenized by their string form. Tokenization is delegated to gensim's
    ``simple_preprocess`` with ``deacc=True``.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens
    
# Materialize the lazy token streams: one token list per review.
suc_words = [*sent_to_words(suc_rl)]
un_words = [*sent_to_words(un_rl)]

 

4. Create bigram models.

# Phrase-detection settings shared by both corpora.
# The higher the threshold, the fewer phrases are detected.
_phrase_settings = {'min_count': 5, 'threshold': 100}

suc_bigram = gensim.models.Phrases(suc_words, **_phrase_settings)
un_bigram = gensim.models.Phrases(un_words, **_phrase_settings)

# Wrap each trained Phrases model in a Phraser, which is what gets applied
# to the token lists later on.
sbigram_mod, ubigram_mod = (
    gensim.models.phrases.Phraser(phrases_model)
    for phrases_model in (suc_bigram, un_bigram)
)

  • 바이그램: 한 문서나 문장에서 자주 함께 쓰이는 두 단어
  • bigram: two words that are often used together in a document/sentence

5. Remove stop words.

# English stop words from NLTK, kept in a set for fast membership tests.
# The original script used `stop_words` without ever defining it (NameError).
# The corpus must be available locally: run nltk.download('stopwords') once.
stop_words = set(stopwords.words('english'))


def remove_stopwords(texts):
    """Return ``texts`` with stop words removed from every document.

    Args:
        texts: iterable of documents. Each document is converted with
            ``str()`` and re-tokenized by ``simple_preprocess`` before
            filtering.

    Returns:
        A list with one token list per document, containing only the tokens
        that are not in the module-level ``stop_words`` set.
    """
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]


# remove stop words
suc_words_nostop = remove_stopwords(suc_words)
un_words_nostop = remove_stopwords(un_words)

 

6. Form bigrams.

def make_sbigrams(texts):
    """Apply the successful-review bigram model to each document in ``texts``.

    Returns a list holding ``sbigram_mod[document]`` for every document, in
    input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(sbigram_mod[tokens])
    return merged_docs

def make_ubigrams(texts):
    """Apply the unsuccessful-review bigram model to each document in ``texts``.

    Returns a list holding ``ubigram_mod[document]`` for every document, in
    input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(ubigram_mod[tokens])
    return merged_docs
    
# Run each stop-word-filtered corpus through its own bigram model.
suc_words_bigrams = make_sbigrams(texts=suc_words_nostop)
un_words_bigrams = make_ubigrams(texts=un_words_nostop)

 

7. Lemmatization

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``. A token's lemma is kept only when
    its ``pos_`` tag is in ``allowed_postags``.

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        allowed_postags: collection of POS tags to keep. The default is an
            immutable tuple rather than a list, so it cannot be modified by
            accident between calls.

    Returns:
        A list with one list of lemma strings per input document.

    See https://spacy.io/api/annotation for the tag set.
    """
    texts_out = []

    for sent in texts:
        doc = nlp(' '.join(sent))
        texts_out.append(
            [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        )

    return texts_out
    
# Load spaCy's small English pipeline with the parser and NER components
# disabled (for efficiency).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize both corpora, keeping only nouns, adjectives, verbs and adverbs.
kept_pos = ['NOUN', 'ADJ', 'VERB', 'ADV']
suc_lemmatized = lemmatization(suc_words_bigrams, allowed_postags=kept_pos)
un_lemmatized = lemmatization(un_words_bigrams, allowed_postags=kept_pos)

 

8. Create dictionaries.

# Build one gensim Dictionary per corpus from its lemmatized documents.
suc_id2word, un_id2word = (
    corpora.Dictionary(docs) for docs in (suc_lemmatized, un_lemmatized)
)

 

9. Create corpora

# Term-document frequency: convert every lemmatized document to its
# bag-of-words form using the dictionary built from the same corpus.
suc_corpus = list(map(suc_id2word.doc2bow, suc_lemmatized))
un_corpus = list(map(un_id2word.doc2bow, un_lemmatized))

 

10. Compute coherence scores (to find the best number of topics).

# Successful product reviews: train one LDA model per candidate number of
# topics (2 through 14) and record the coherence score of each.
x = range(2, 15)
scoherence_values = []

for i in x:
    suc_ldamodel = gensim.models.ldamodel.LdaModel(
        corpus=suc_corpus,
        num_topics=i,
        id2word=suc_id2word,
    )
    scoherence_model_lda = CoherenceModel(
        model=suc_ldamodel,
        texts=suc_lemmatized,
        dictionary=suc_id2word,
    )
    scoherence_lda = scoherence_model_lda.get_coherence()
    scoherence_values.append(scoherence_lda)

# Plot coherence against the number of topics.
plt.plot(x, scoherence_values)
plt.title('Coherence Values of Successful Product Reviews')
plt.xlabel('number of topics')
plt.ylabel('coherence score')
plt.show()
# Unsuccessful product reviews: train one LDA model per candidate number of
# topics (2 through 14) and record the coherence score of each.
ucoherence_values = []

for i in range(2, 15):

    # The original passed num_topics=j, but `j` is never defined; the loop
    # variable is `i`.
    un_ldamodel = gensim.models.ldamodel.LdaModel(un_corpus, num_topics=i, id2word=un_id2word)
    ucoherence_model_lda = CoherenceModel(model=un_ldamodel, texts=un_lemmatized, dictionary=un_id2word)
    ucoherence_lda = ucoherence_model_lda.get_coherence()
    ucoherence_values.append(ucoherence_lda)

# Plot coherence against the number of topics.
x = range(2, 15)
plt.plot(x, ucoherence_values)
plt.xlabel('number of topics')
plt.ylabel('coherence score')
plt.title('Coherence Values of Unsuccessful Product Reviews')
plt.show()

  • 0.55와 0.7 사이의 coherence score가 좋다.
  • A coherence score between 0.55 and 0.7 is considered good.

11. Create an LDA model with the optimal number of topics. + Visualization

# "import pyLDAvis" alone does not load the gensim adapter submodule, so
# pyLDAvis.gensim_models would raise AttributeError; import it explicitly.
import pyLDAvis.gensim_models

# successful product reviews: final model with 5 topics
sldamodel = gensim.models.ldamodel.LdaModel(suc_corpus, num_topics=5, id2word=suc_id2word)

# visualization: build the interactive view and save it as a standalone HTML file
suc_vis = pyLDAvis.gensim_models.prepare(sldamodel, suc_corpus, suc_id2word, sort_topics=False)
pyLDAvis.save_html(suc_vis, 'suc_vis.html')

# unsuccessful product reviews: final model with 4 topics
uldamodel = gensim.models.ldamodel.LdaModel(un_corpus, num_topics=4, id2word=un_id2word)

# visualization: build the interactive view and save it as a standalone HTML file
un_vis = pyLDAvis.gensim_models.prepare(uldamodel, un_corpus, un_id2word, sort_topics=False)
pyLDAvis.save_html(un_vis, 'un_vis.html')

 

 

 

* Unauthorized copying and distribution of this post are not allowed.

* 해당 글에 대한 무단 배포 및 복사를 허용하지 않습니다.