文本分析实例---QQ聊天记录分析-白红宇

文本分析实例---QQ聊天记录分析

阅读量：5088 次

发布时间：2019-06-13

本文共 3645 字，大约阅读时间需要 12 分钟。

对QQ聊天记录进行分析。由于每天产生的聊天记录比较多，所以选取的是2月份整月的聊天记录数据。分析要产生的结果有三个：聊天记录中发消息最多的前15人（top15）；统计24小时中哪个时间段发贴最多；以及对消息中的热词进行抽取。

对QQ用户发贴次数进行统计，需要注意QQ导出的聊天记录格式【年月日时分秒 QQ账号相关信息】，需要对聊天记录做解析。另外，对聊天内容也要做解析。

详细思路不做详解，只贴结果和部分代码，相信大家一看就明白。

统计24小时时间段QQ消息数量

可以看出每天下午3点到5点大家都非常活跃。

另一个就是对讨论的话题做分析：首先要对发的消息做分词处理，去掉停用词，然后按词出现的次数统计，得到如下结果。

第一个表示出现的词，第二个表示在某个时间段内出现的次数，总的来说，我们这个群还算是一个技术群吧。

相关部分代码：

def userProcess():
    """Plot the 15 most frequent posters as a horizontal bar chart.

    Records are read from ``LoadUserInfo.loadUser()``.  Only records with
    exactly three fields are counted, and the third field (index 2) is used
    as the poster's label.  The number of counted records is printed before
    the chart is shown.
    """
    contentArray = LoadUserInfo.loadUser()
    userArray = [userInfo[2] for userInfo in contentArray if len(userInfo) == 3]
    print(len(userArray))

    # most_common(15) yields (label, count) pairs ordered by descending count.
    topPosters = Counter(userArray).most_common(15)
    userNameLable = [name for name, _ in topPosters]
    postMessageNum = [count for _, count in topPosters]

    # Raw string: the Windows path contains backslashes (\W, \F, \s) that are
    # not valid escape sequences and trigger warnings in a normal literal.
    zh_font = matplotlib.font_manager.FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc')

    positions = np.arange(len(userNameLable))
    plt.barh(positions, postMessageNum, align='center', alpha=0.4)
    # The font is passed explicitly so the Chinese labels and titles use it.
    plt.yticks(positions, userNameLable, fontproperties=zh_font)
    plt.xlabel('发贴数量', fontproperties=zh_font)
    plt.title('java-Endless Space群(4881914)发贴最多的15个人', fontproperties=zh_font)
    plt.show()

def hourProcess():
    """Plot the number of messages per hour label as a bar chart.

    Records are read from ``LoadUserInfo.loadUser()``.  For every record with
    exactly three fields, the second field (index 1) is split on ``':'`` and
    the first piece is used as the hour label.  The total count, the sorted
    (label, count) pairs, and the label and count lists are printed before
    the chart is shown.
    """
    contentArray = LoadUserInfo.loadUser()
    hourArray = [
        userInfo[1].split(':')[0]
        for userInfo in contentArray
        if len(userInfo) == 3
    ]
    print(len(hourArray))

    hour_counts = Counter(hourArray)

    def hourKey(item):
        # Order numerically when the label is a plain number, so that an
        # unpadded '9' sorts before '10'; a plain string sort would not.
        # Non-numeric labels are kept and placed after the numeric ones.
        label = item[0].strip()
        if label.isdecimal():
            return (0, int(label), item[0])
        return (1, 0, item[0])

    sortByHour = sorted(hour_counts.items(), key=hourKey)
    print(sortByHour)

    postMessageLable = [label for label, _ in sortByHour]
    postMessageNum = [count for _, count in sortByHour]
    print(postMessageLable)
    print(postMessageNum)

    # Build the bar chart of message counts.
    ind = np.arange(len(postMessageNum)) + 0.5  # x position of each bar
    width = 0.35  # bar width

    fig, ax = plt.subplots()
    # Centre each bar on its x position so the tick labels set below sit
    # directly under the bars.
    rects = ax.bar(ind, postMessageNum, width, color='r', align='center')

    ax.set_ylabel('message number')
    ax.set_title('QQ message number of hour,total message ( ' + str(len(hourArray)) + ")")
    ax.set_xticks(ind)
    ax.set_xticklabels(postMessageLable)

    # Write each bar's value just above it, centred horizontally on the bar.
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2., height, '%d' % int(height),
                ha='center', va='bottom')

    plt.show()

def messageProcess():
    """Segment user messages into words and print word frequencies.

    Messages are read from ``LoadMessageInfo.loadMessage()`` and each one is
    segmented with ``jieba.cut`` in accurate mode (``cut_all=False``).  Words
    of length 1 are discarded, stop words listed in ``./data/stopword.json``
    are removed, and the resulting ``Counter`` is printed.

    NOTE(review): the original comment said the segmentation applies to the
    fourth column of the imported file, but the code passes each loaded item
    to ``jieba.cut`` as a whole — confirm what ``loadMessage()`` returns.
    """
    contentArray = LoadMessageInfo.loadMessage()

    print("processing original data ........")
    # Keep only words longer than one character.
    wordArray = [
        word
        for messageInfo in contentArray
        for word in jieba.cut(messageInfo, cut_all=False)
        if len(word) > 1
    ]

    print("remove stop word data ........")
    # Context manager so the stop-word file is closed once it has been read.
    with open('./data/stopword.json', 'r', encoding='utf8') as jsonResource:
        # A set gives constant-time membership tests.
        stopwords = set(json.load(jsonResource))

    # Build a new list rather than calling wordArray.remove() while iterating
    # wordArray: removing during iteration skips the element after each
    # removal and only removes the first occurrence, so stop words remained.
    wordArray = [word for word in wordArray if word not in stopwords]

    print("text is processing.......")
    word_counts = Counter(wordArray)
    print(word_counts)
    print("processing is over")

转载于:https://www.cnblogs.com/wzjhoutai/p/7190834.html

你可能感兴趣的文章