新增年度关键词

2024-02-09 15:23:13 +08:00
parent 85cff9aaea
commit 163b2ef269
3 changed files with 123 additions and 12 deletions
--- a/app/analysis/analysis.py
+++ b/app/analysis/analysis.py
@@ -3,6 +3,8 @@ from collections import Counter
 import sys
 from datetime import datetime

+import jieba
+
 from app.DataBase import msg_db, MsgType
 from pyecharts import options as opts
 from pyecharts.charts import WordCloud, Calendar, Bar, Line, Pie
@@ -64,6 +66,46 @@ def wordcloud_(wxid, time_range=None):
    }


+def get_wordcloud(text):
+    """Build a word cloud and pick the top keyword from a chat text.
+
+    The text is segmented with jieba; single-character tokens and stopwords
+    are dropped, and at most the 100 most frequent remaining words are
+    charted.
+
+    Args:
+        text: Concatenated message text to analyse.
+
+    Returns:
+        A dict with ``chart_data_wordcloud`` (serialized WordCloud options),
+        ``keyword`` (the most frequent word, ``''`` when no word survives
+        filtering) and ``keyword_max_num`` (its count, ``0`` in that case).
+
+    Raises:
+        FileNotFoundError: If the bundled stopword list cannot be located.
+    """
+    # Segment with jieba and count how often each token occurs.
+    word_count = Counter(jieba.cut(text))
+
+    stopwords = set()
+    # This list has no fallback location, so treat it as optional instead of
+    # failing the whole report when it is absent.
+    extra_stopwords_file = './app/data/stopwords.txt'
+    if os.path.exists(extra_stopwords_file):
+        with open(extra_stopwords_file, "r", encoding="utf-8") as stopword_file:
+            stopwords.update(stopword_file.read().splitlines())
+
+    # Resolve the path of the bundled stopword list: when it is not reachable
+    # from the working directory, look under sys._MEIPASS (if set) or next to
+    # this module.
+    stopwords_file = './app/resources/data/stopwords.txt'
+    if not os.path.exists(stopwords_file):
+        resource_dir = getattr(sys, '_MEIPASS', os.path.abspath(os.path.dirname(__file__)))
+        stopwords_file = os.path.join(resource_dir, 'app', 'resources', 'data', 'stopwords.txt')
+    with open(stopwords_file, "r", encoding="utf-8") as stopword_file:
+        stopwords.update(stopword_file.read().splitlines())
+
+    # Keep multi-character, non-stopword tokens, most frequent first.
+    # sorted() is stable, so ties keep their first-seen order.
+    ranked = sorted(
+        (
+            (word, count)
+            for word, count in word_count.items()
+            if len(word) > 1 and word not in stopwords
+        ),
+        key=lambda item: item[1],
+        reverse=True,
+    )
+    text_data = ranked[:100]
+
+    # Empty input (or everything filtered out) leaves nothing to unpack;
+    # report a neutral keyword instead of raising IndexError.
+    keyword, max_num = text_data[0] if text_data else ('', 0)
+
+    # Build the word cloud chart.
+    w = (
+        WordCloud()
+        .add(series_name="聊天文字", data_pair=text_data, word_size_range=[5, 40])
+    )
+    return {
+        'chart_data_wordcloud': w.dump_options_with_quotes(),
+        'keyword': keyword,
+        'keyword_max_num': max_num,
+    }
+
+
 def wordcloud_christmas(wxid, year='2023'):
    import jieba
    txt_messages = msg_db.get_messages_by_type(wxid, MsgType.TEXT, year)
@@ -375,6 +417,8 @@ def my_message_counter(time_range, my_name=''):
    types_count = {}
    send_num = 0  # 发送消息的数量
    weekday_count = {}
+    str_content = ''
+    total_text_num = 0
    for message in msg_data:
        type_ = message[2]
        is_sender = message[4]
@@ -393,6 +437,10 @@ def my_message_counter(time_range, my_name=''):
            weekday_count[weekday] += 1
        else:
            weekday_count[weekday] = 1
+        if type_ == 1:
+            total_text_num += len(message[7])
+            if is_sender == 1:
+                str_content += message[7]
    receive_num = len(msg_data) - send_num
    data = [[types_.get(key), value] for key, value in types_count.items() if key in types_]
    if not data:
@@ -418,7 +466,7 @@ def my_message_counter(time_range, my_name=''):
        Pie()
        .add(
            "",
-            [['发送', send_num], ['接收',receive_num ]],
+            [['发送', send_num], ['接收', receive_num]],
            center=["40%", "50%"],
        )
        .set_global_opts(
@@ -428,9 +476,14 @@ def my_message_counter(time_range, my_name=''):
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}\n{d}%", position='inside'))
        # .render("./data/聊天统计/pie_scroll_legend.html")
    )
+    w = get_wordcloud(str_content)
    return {
        'chart_data_sender': p2.dump_options_with_quotes(),
        'chart_data_types': p1.dump_options_with_quotes(),
+        'chart_data_wordcloud': w.get('chart_data_wordcloud'),
+        'keyword': w.get('keyword'),
+        'keyword_max_num': w.get('keyword_max_num'),
+        'total_text_num':total_text_num,
    }