婷婷五月六月综合缴情,国产真人一级a爱做片喷水

主頁 > 知識庫 > python 爬取京東指定商品評論并進行情感分析

python 爬取京東指定商品評論并進行情感分析

項目地址

https://github.com/DA1YAYUAN/JD-comments-sentiment-analysis

爬取京東商城中指定商品下的用戶評論，對數據預處理后基于SnowNLP的sentiment模塊對文本進行情感分析。

運行環境

Mac OS X
Python3.7 requirements.txt
Pycharm

運行方法

數據爬取（jd_comment.py）

啟動jd_comment.py，建議修改jd_comment.py中變量user-agent為自己瀏覽器用戶代理
輸入京東商品完整URL
得到京東評論詞云，存放于jd_ciyun.jpg（詞云輪廓形狀存放于jdicon.jpg)
得到京東評論數據，存放于jd_comment.csv

import os
import time
import json
import random
import csv
import re

import jieba
import requests
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Image whose shape is used as the word-cloud mask
WC_MASK_IMG = 'jdicon.jpg'
# Text file the scraped comments are saved to
COMMENT_FILE_PATH = 'jd_comment.txt'
# Font file used to render the word cloud
WC_FONT_PATH = '/Library/Fonts/Songti.ttc'


def spider_comment(page=0, key=0):
    """Fetch one page of JD.com comments for a product and append them to disk.

    The text of every comment on the page is printed and appended, one per
    line, to ``COMMENT_FILE_PATH``. If the request fails or the response
    cannot be parsed, a failure message is printed and nothing is written.

    :param page: index of the comment page to fetch (default 0)
    :param key: JD product id (the digits of the item URL); converted to
        ``str`` before being placed in the URL
    :return: None
    """
    product_id = str(key)
    # Query parameters must be joined with '&', otherwise the server sees a
    # single malformed ``callback`` value.
    url = ('https://club.jd.com/comment/productPageComments.action'
           '?callback=fetchJSON_comment98vv4646'
           '&productId=' + product_id +
           '&score=0&sortType=5&page=%s&pageSize=10&isShadowSku=0&fold=1' % page)
    kv = {'user-agent': 'Mozilla/5.0',
          'Referer': 'https://item.jd.com/' + product_id + '.html'}

    try:
        r = requests.get(url, headers=kv, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        print('爬取失敗')
        # Without a response there is nothing to parse
        return

    # The body is JSONP, ``callback(<json>);`` - keep what is between the
    # outermost parentheses instead of relying on fixed offsets.
    body = r.text
    start = body.find('(')
    end = body.rfind(')')
    try:
        r_json_obj = json.loads(body[start + 1:end])
        r_json_comments = r_json_obj['comments']
    except (ValueError, KeyError, TypeError):
        print('爬取失敗')
        return

    # Open the file once per page; UTF-8 so the readers of this file, which
    # decode it as UTF-8, get the same text on every platform.
    with open(COMMENT_FILE_PATH, 'a+', encoding='utf-8') as file:
        for r_json_comment in r_json_comments:
            content = r_json_comment['content']
            file.write(content + '\n')
            print(content)


def batch_spider_comment():
    """Prompt for a JD product URL and scrape its first 10 comment pages.

    Any existing ``COMMENT_FILE_PATH`` is removed first so the file only
    holds comments from this run.

    :raises ValueError: if the input contains no digits to use as product id
    :return: None
    """
    address = input("Please enter the address:")
    # Prefer the id in ".../<digits>.html" so digits elsewhere in the URL
    # (query string, tracking parameters) are not glued onto it; otherwise
    # fall back to keeping every digit of the input.
    match = re.search(r"(\d+)\.html", address)
    key = match.group(1) if match else re.sub(r"\D", "", address)
    if not key:
        raise ValueError("no product id found in: %r" % address)

    # Start from a clean file before new comments are appended to it
    if os.path.exists(COMMENT_FILE_PATH):
        os.remove(COMMENT_FILE_PATH)

    # range() sets how many pages are fetched
    for i in range(10):
        spider_comment(i, key)
        # Random pause of up to 5 seconds between requests to look less like
        # a bot and reduce the risk of the IP being blocked
        time.sleep(random.random() * 5)


def cut_word():
    """Segment the saved comments into words with jieba.

    :return: all words of ``COMMENT_FILE_PATH`` joined by single spaces
    """
    # Decode explicitly as UTF-8, the same way txt_change_to_csv() reads this
    # file, instead of relying on the platform default encoding.
    with open(COMMENT_FILE_PATH, encoding='utf-8') as file:
        comment_txt = file.read()
    # cut_all=False: accurate mode rather than full mode
    wl = " ".join(jieba.cut(comment_txt, cut_all=False))
    print(wl)
    return wl


def create_word_cloud():
    """Build a word cloud from the segmented comments, save it and show it.

    The cloud takes the shape of ``WC_MASK_IMG``, is written to
    ``jd_ciyun.jpg`` and is then displayed with matplotlib.

    :return: None
    """
    # Load the mask and release the image file handle right away
    with Image.open(WC_MASK_IMG) as mask_img:
        wc_mask = np.array(mask_img)
    # Word-cloud settings: font, background colour, mask shape, size limits
    wc = WordCloud(background_color="white", max_words=2000, mask=wc_mask, scale=4,
                   max_font_size=50, random_state=42, font_path=WC_FONT_PATH)
    wc.generate(cut_word())
    # Save first so the image exists even if the viewer window is never closed
    wc.to_file("jd_ciyun.jpg")
    # Create the figure before drawing; creating it after imshow() would open
    # a second, empty window.
    plt.figure()
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()


def txt_change_to_csv():
    """Copy jd_comment.txt into jd_comment.csv, one CSV row per text line.

    Each line has its newline removed and is split on ASCII commas; the
    pieces become the cells of that row. Every row is also echoed to stdout.
    """
    # The CSV is opened (and truncated) first, then the text file is read
    with open('jd_comment.csv', 'w+', encoding="utf8", newline='') as target, \
            open("jd_comment.txt", 'r', encoding='utf8') as source:
        rows = csv.writer(target, dialect="excel")
        for raw in source:
            cells = raw.strip('\n').split(',')
            print(cells)
            rows.writerow(cells)

if __name__ == '__main__':
    # Pipeline, in order: scrape the comments, convert the text file to CSV,
    # then draw the word cloud.
    for step in (batch_spider_comment, txt_change_to_csv, create_word_cloud):
        step()

模型訓練（train.py）

準備正負語料集online_shopping_10_cats.csv，分別存入negative.txt和positive.txt
啟動train.py，新建文件sentiment.marshal，存入訓練后的模型
找到外部庫中snownlp中sentiment模塊，將訓練得到的sentiment.marshal.3文件覆蓋sentiment模塊中自帶的sentiment.marshal.3

# -*-coding:utf-8-*-

def train():
    """Train SnowNLP's sentiment model on local corpora and save the result.

    Uses ``negative.txt`` and ``positive.txt`` (to be supplied by the user)
    and saves the model under the name ``sentiment.marshal``.
    """
    from snownlp import sentiment as model

    print("開始訓(xùn)練數(shù)據(jù)集...")
    negative_corpus, positive_corpus = 'negative.txt', 'positive.txt'
    model.train(negative_corpus, positive_corpus)
    # Per the original note: Python 2 writes sentiment.marshal, Python 3
    # writes sentiment.marshal.3
    model.save('sentiment.marshal')
    # Once training is done, replace the model bundled with snownlp's
    # sentiment module with the newly trained one.

def main():
    """Train on the positive/negative review corpora, then report completion."""
    train()
    done_message = "數(shù)據(jù)集訓(xùn)練完成！"
    print(done_message)

# Run the training only when this file is executed as a script
if __name__ == '__main__':
    main()

情感分析（sentiment.analysis.py）

啟動sentiment.analysis.py
開始對jd_comment.csv中評論進行數據處理，處理后文件存入processed_comment_data.csv
sentiment模塊根據sentiment.marshal.3對評論進行情感評分，評分結果存入result.csv
評分結果可視化，生成文件fig.png

from snownlp import sentiment
import pandas as pd
import snownlp
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

#from word_cloud import word_cloud_creation, word_cloud_implementation, word_cloud_settings

def read_csv():
    """Load the scraped comments from jd_comment.csv, one comment per row.

    The file is read line by line instead of through ``pd.read_csv`` with a
    newline separator: newer pandas releases reject that separator, and the
    default header handling turned the first comment into the column name.
    Blank lines are skipped.

    :return: single-column DataFrame (column label ``0``) with one comment
        string per row
    """
    with open('jd_comment.csv', 'r', encoding='utf-8') as fr:
        comments = [line.rstrip('\n') for line in fr if line.strip()]
    # Building from a dict keeps the column present even when no comment was read
    comment_data = pd.DataFrame({0: comments})
    return comment_data


def clean_data(data):
    """Drop rows with missing values, then de-duplicate the first column.

    :param data: DataFrame whose first column holds the comments
    :return: new single-column DataFrame holding the distinct values of that
        column
    """
    complete_rows = data.dropna()
    distinct_values = complete_rows.iloc[:, 0].unique()
    return pd.DataFrame(distinct_values)


def clean_repeat_word(raw_str, reverse=False):
    """Keep a single occurrence of every character in ``raw_str``.

    By default the first occurrence of each character is kept; with
    ``reverse=True`` the string is scanned from the end, so the last
    occurrence is kept instead. Surviving characters stay in their
    original relative order.

    NOTE(review): despite the name this works per character, not per word.

    :param raw_str: text to compress
    :param reverse: scan from the end of the string instead of the start
    :return: the text with repeated characters removed
    """
    text = raw_str[::-1] if reverse else raw_str
    # dict keys preserve insertion order, giving each character once in
    # order of first appearance
    kept = ''.join(dict.fromkeys(text))
    return kept[::-1] if reverse else kept


def processed_data(filename):
    """Clean the scraped comments and write them to ``<filename>.csv``.

    The comments are loaded and cleaned, then repeated characters are
    stripped in a forward pass followed by a backward pass.

    :param filename: output path without the ``.csv`` extension
    """
    cleaned = clean_data(read_csv())
    forward_pass = cleaned.iloc[:, 0].apply(clean_repeat_word)
    backward_pass = forward_pass.apply(clean_repeat_word, reverse=True)
    result = pd.DataFrame(backward_pass)
    result.to_csv(f'{filename}.csv', encoding='utf-8', index_label=None, index=None)


def train():
    """Train on the negative/positive corpora and save the trained model.

    NOTE(review): the model is saved as 'seg.marshal' while the original
    comment talks about sentiment.marshal (Python 2) / sentiment.marshal.3
    (Python 3) - confirm which file name is intended.
    """
    corpora = ('negative.txt', 'positive.txt')
    sentiment.train(*corpora)
    sentiment.save('seg.marshal')


# One dict per analysed comment (score, label, comment text); filled by test()
sentiment_list = []

# One flag per analysed comment: 1 = like, 0 = neutral, -1 = dislike;
# filled by test() and counted by data_virtualization()
res_list = []


def test(filename, to_filename):
    """Score every comment in ``<filename>.csv`` and save the results.

    Each non-blank line is scored with SnowNLP. Scores above 0.6 are
    labelled as liked, scores below 0.4 as disliked, anything else as
    neutral. The numeric flag (1 / -1 / 0) is appended to the module-level
    ``res_list`` and the detailed record to ``sentiment_list``; all records
    are then written to ``<to_filename>.csv``.

    :param filename: input path without the ``.csv`` extension
    :param to_filename: output path without the ``.csv`` extension
    :return: None
    """
    with open(f'{filename}.csv', 'r', encoding='utf-8') as fr:
        for line in fr:
            comment = line.rstrip('\n')
            # A blank line is not a comment; do not score it
            if not comment.strip():
                continue
            # Read the property once - each access re-runs the classifier
            score = snownlp.SnowNLP(comment).sentiments
            if score > 0.6:
                res, flag = '喜歡', 1
            elif score < 0.4:
                res, flag = '不喜歡', -1
            else:
                res, flag = '一般', 0
            res_list.append(flag)
            sent_dict = {
                '情感分析結果': score,
                '評價傾向': res,
                '商品評論': comment
            }
            sentiment_list.append(sent_dict)
            print(sent_dict)
    df = pd.DataFrame(sentiment_list)
    df.to_csv(f'{to_filename}.csv', index=None, encoding='utf-8',
              index_label=None, mode='w')


def data_virtualization():
    """Plot how many comments fell into each sentiment class as a bar chart.

    The counts come from the module-level ``res_list`` (1 = like,
    0 = neutral, -1 = dislike). The chart is saved to ``fig.png`` and then
    shown.

    :return: None
    """
    font = FontProperties(fname='/System/Library/Fonts/Supplemental/Songti.ttc', size=14)
    likes = res_list.count(1)
    common = res_list.count(0)
    unlikes = res_list.count(-1)

    # One bar per class: (x position, height, legend label)
    plt.bar([1], [likes], label='喜歡')
    plt.bar([2], [common], label='一般')
    plt.bar([3], [unlikes], label='不喜歡')

    # Every piece of Chinese text gets the CJK font; the keyword is the
    # lowercase ``fontproperties``.
    plt.xticks([1, 2, 3], ['喜歡', '一般', '不喜歡'], fontproperties=font)
    plt.legend(prop=font)
    plt.xlabel('評價種類', fontproperties=font)
    plt.ylabel('評價數目', fontproperties=font)
    plt.title('商品評論情感分析結果-條形圖', fontproperties=font)
    plt.savefig('fig.png')
    plt.show()
# Disabled helper kept as a bare string literal: it relies on the word_cloud
# helpers whose import is commented out at the top of this script.
'''
def word_cloud_show():
    #將商品評論轉(zhuǎn)為高頻詞匯的詞云
    wl = word_cloud_creation('jd_comment.csv')
    wc = word_cloud_settings()
    word_cloud_implementation(wl, wc)
'''

def main():
    """Run the analysis pipeline: clean the comments, score them, plot the result.

    NOTE(review): test() is pointed at 'jd_comment', not at the
    'processed_comment_data' file written just before it - confirm that
    scoring the uncleaned comments is intended.
    """
    cleaned_stem = 'processed_comment_data'
    processed_data(cleaned_stem)
    # Retraining on the positive/negative corpora is skipped here; call
    # train() at this point to rebuild the model.

    source_stem, result_stem = 'jd_comment', 'result'
    test(source_stem, result_stem)

    print('數(shù)據(jù)可視化中...')
    data_virtualization()

    print('python程序運行結(jié)束。')

# Run the analysis only when this file is executed as a script
if __name__ == '__main__':
    main()

詞云輪廓圖

商品評論詞云

情感分析結果可視化

以上就是python 爬取京東指定商品評論并進行情感分析的詳細內容，更多關于python 爬取京東評論并進行情感分析的資料請關注腳本之家其它相關文章！

您可能感興趣的文章:

python 爬取吉首大學網站成績單
python趣味挑戰之爬取天氣與微博熱搜并自動發給微信好友
python 爬取影視網站下載鏈接
Python爬蟲之爬取我愛我家二手房數據
python結合多線程爬取英雄聯盟皮膚(原理分析)
python爬取豆瓣電影TOP250數據
python爬取鏈家二手房的數據
教你怎么用python爬取愛奇藝熱門電影
Python爬蟲之爬取最新更新的小說網站
Python爬蟲實戰之爬取攜程評論

標(biāo)簽：駐馬店宿遷六盤水常州山東江蘇蘭州成都

巨人網絡通訊聲明：本文標題《python 爬取京東指定商品評論并進行情感分析》，本文關鍵詞 python,爬取,京東,指定,商品,；如發現本文內容存在版權問題，煩請提供相關信息告之我們，我們將及時溝通與處理。本站內容系統采集于網絡，涉及言論、版權與本站無關。

python 爬取京東指定商品評論并進行情感分析

目錄

項目地址

運行環境

運行方法

數據爬取（jd_comment.py）

模型訓練（train.py）

情感分析（sentiment.analysis.py）

詞云輪廓圖

商品評論詞云

情感分析結果可視化

四合一精品企业网站建设

¥888元限时抢购

立即咨询快速购买

企业400电话

合计11份范本：公司章程+合伙协议+出资协议+合作协议+股权转让协议+增资扩股协议+股权激励+股东会决议+董事会决议

數據爬取（jd_comment.py）