python实现LDA主题分类模型
LDA(Latent Dirichlet Allocation)是一种常用的主题模型,它可以帮助我们从大量文本数据中发现隐藏的主题信息。需要的库:
import pandas as pd
import matplotlib.pyplot as plt
import jieba
import jieba.posseg as pseg
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import os
import re
import pyLDAvis.gensim_models
ScenicSpotReviewAnalysis类
class ScenicSpotReviewAnalysis:
    """LDA topic-modelling pipeline for scenic-spot review texts.

    Loads raw review .txt files grouped in per-period subdirectories,
    cleans and tokenizes them with jieba, trains gensim LDA models over a
    range of topic counts, plots a perplexity curve to pick a topic
    number, and renders a pyLDAvis visualization.
    """

    def __init__(self, data_path, stopwords_path):
        self.data_path = data_path                              # root dir containing per-period subdirs of .txt reviews
        self.stopwords = self._load_stopwords(stopwords_path)   # set of words to drop during tokenization
        self.all_texts = self._load_data()                      # {subdir name: [space-joined token strings, one per file]}
        self.dictionary = None                                  # gensim Dictionary, built by train_lda_models
        self.corpus = None                                      # bag-of-words corpus, built by train_lda_models

    def _load_stopwords(self, path):
        """Load stopwords from *path* (one per line, utf-8).

        Returns a set (O(1) membership — the words are tested against
        every token of every document in _load_data). A few
        domain-specific words are excluded as well; the set also removes
        the duplicate entries the original hard-coded list carried.
        """
        with open(path, encoding="utf8") as f:
            stopwords = set(f.read().split("\n"))
        stopwords.add("\n")
        # Domain-specific words to exclude on top of the stopword file.
        stopwords.update(['地方', '总体', '真的', '建议', '小时', '一座'])
        return stopwords

    def _load_data(self):
        """Read every review file, keep only CJK characters, tokenize.

        Returns {subdir name: list of documents}, each document being a
        space-joined string of jieba tokens with stopwords removed.
        """
        all_texts = {}
        for pathdir in os.listdir(self.data_path):
            if pathdir.endswith(".DS_Store"):   # skip macOS metadata entries
                continue
            subfilepath = os.path.join(self.data_path, pathdir)
            if not os.path.isdir(subfilepath):
                continue
            month_text = []
            for file in os.listdir(subfilepath):
                filepath = os.path.join(subfilepath, file)
                with open(filepath, encoding="utf-8") as f:
                    text = f.read()
                # Keep only Chinese characters (drops punctuation, digits, latin).
                text = "".join(re.findall(r'[\u4e00-\u9fa5]+', text))
                words = jieba.lcut(text)
                valid_words = [word for word in words if word not in self.stopwords]
                month_text.append(" ".join(valid_words))
            all_texts[pathdir] = month_text
        return all_texts

    def filter_pos(self, target_texts):
        """POS-filter each document in *target_texts*.

        Keeps only nouns (flag n*) and adjectives (flag a*) of length > 1,
        re-joining each document with single spaces.
        """
        filtered_texts = []
        for sentence in target_texts:
            filtered_words = []
            for word, flag in pseg.cut(sentence):
                if (flag.startswith('n') or flag.startswith('a')) and len(word) > 1:
                    filtered_words.append(word)
            filtered_texts.append(' '.join(filtered_words))
        return filtered_texts

    def train_lda_models(self, texts, num_topics_range):
        """Train and save one LDA model per topic count in *num_topics_range*.

        *texts* is a list of token lists. Side effects: sets
        self.dictionary / self.corpus and writes
        ./lda_{k}_{len(texts)}.model files.
        """
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(tmp) for tmp in texts]
        for num_topics in num_topics_range:
            lda_model = LdaModel(corpus=self.corpus, num_topics=num_topics,
                                 id2word=self.dictionary, passes=20)
            lda_model.save(f'./lda_{num_topics}_{len(texts)}.model')

    def plot_perplexity(self, num_topics_range):
        """Plot log-perplexity of each saved model vs. its topic count.

        Models that fail to load are skipped with a printed error so one
        missing file does not abort the whole curve.
        """
        x_list, y_list = [], []
        for num_topics in num_topics_range:
            try:
                lda_model = LdaModel.load(f'./lda_{num_topics}_{len(self.corpus)}.model')
                # NOTE: log_perplexity returns a per-word likelihood bound
                # (more negative = worse); the curve's elbow is what matters.
                perplexity = lda_model.log_perplexity(self.corpus)
                x_list.append(num_topics)
                y_list.append(perplexity)
            except Exception as e:
                print('Error:', e)
        plt.plot(x_list, y_list)
        plt.xlabel('Num Topics')
        plt.ylabel('Perplexity Score')
        plt.show()

    def visualize_topics(self, num_topics):
        """Render the saved *num_topics* model with pyLDAvis.

        Writes LDA.html and returns the prepared visualization object.
        Requires train_lda_models to have been called first (needs
        self.corpus / self.dictionary).
        """
        lda_model = LdaModel.load(f'./lda_{num_topics}_{len(self.corpus)}.model')
        # (The original also computed top_topics/average coherence here but
        # never used the result; that dead — and costly — work is removed.)
        vis = pyLDAvis.gensim_models.prepare(lda_model, self.corpus, self.dictionary,
                                             mds='tsne', sort_topics=False)
        pyLDAvis.save_html(vis, "LDA.html")
        return vis
执行
if __name__ == '__main__':
    data_path = "./景点评论数据/景点评论数据/txt"
    stopwords_path = "./stopwords.txt"

    analysis = ScenicSpotReviewAnalysis(data_path, stopwords_path)

    # Reviews of the first phase; one preprocessed document per source file.
    phase_1 = analysis.all_texts['P1']
    topic_counts = range(1, 16)

    # Keep only nouns/adjectives in every document before training.
    filtered_docs = []
    for doc in phase_1:
        filtered_docs.append(analysis.filter_pos([doc])[0])

    tokenized_docs = [doc.split() for doc in filtered_docs]
    analysis.train_lda_models(tokenized_docs, topic_counts)
    analysis.plot_perplexity(topic_counts)
    vis = analysis.visualize_topics(3)  # number of topics to visualize
代码运行之后会输出一张困惑度曲线图,用以确定最佳主题个数:找出变化率开始显著下降的点,这个点通常被称为肘点。可视化图保存在 LDA.html 中。

GitCode 天启AI是一款由 GitCode 团队打造的智能助手,基于先进的LLM(大语言模型)与多智能体 Agent 技术构建,致力于为用户提供高效、智能、多模态的创作与开发支持。它不仅支持自然语言对话,还具备处理文件、生成 PPT、撰写分析报告、开发 Web 应用等多项能力,真正做到“一句话,让 Al帮你完成复杂任务”。
更多推荐
所有评论(0)