In [1]:
import json

import os

from bs4 import BeautifulSoup

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

plt.style.use('seaborn-white')

sns.set(style="whitegrid")

%matplotlib inline

plt.rcParams['font.sans-serif'] = ['Microsoft YaHei'] # for Chinese characters

2. Data loading

Read and tidy the raw data returned by the crawler. The data has already been merged locally into a single CSV file, so the cell below is kept for reference only and is not re-run (a merge sketch follows it).

In [2]:
# def readOriData(path):
#     payloads = []
#     files = os.listdir(path)
#     for file in files:
#         if file.endswith(".json"):
#             filePath = os.path.join(path, file)
#             with open(filePath) as f:
#                 obj = json.load(f)
#             payload = obj["payload"]
#             for item in payload:
#                 # flatten the nested author object into top-level fields
#                 author = item["author"]
#                 item['authorID'] = author['id']
#                 item["authorName"] = author["name"]
#                 item["authorHomePage"] = author["link"]
#                 item["authorRole"] = author["role"]
#                 item.pop('author', None)
#                 # the category field arrives as an HTML fragment; keep only its text
#                 soup = BeautifulSoup(item["category"], "lxml")
#                 item["category"] = soup.string
#                 payloads.append(item)
#     return payloads
# data1 = readOriData("0906")
# data1 = json.dumps(data1)
# df = pd.read_json(data1)
# df.to_csv("0906df.csv")  # saved under the date the data was scraped
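
The per-date dumps produced above (e.g. 0906df.csv) were later combined into the single 0821.csv loaded in section 2.1; that merge step is not shown in the notebook. A minimal sketch of one way it could have been done, assuming the per-date files all end in "df.csv" and that duplicate articles are dropped by id (both assumptions, not the author's actual script):

import glob

# Hypothetical merge of the per-date dumps into one CSV file.
frames = [pd.read_csv(f) for f in sorted(glob.glob("*df.csv"))]   # e.g. 0906df.csv, ...
merged = pd.concat(frames, ignore_index=True).drop_duplicates(subset="id")
merged.to_csv("0821.csv", index=False)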

2.1. Selecting data fields

Select the features (columns) to be used in the analysis:

  • "id": unique article ID
  • "date": publication date
  • "permalink": article URL; kept separate from id because the path also contains the article category rather than just the numeric ID
  • "title"
  • "category": article category
  • "view": view count; above ten thousand (and one hundred thousand) the site returns only a rounded display string such as "X万" instead of the exact number, so it needs conversion
  • "like"
  • "comment"
  • "bookmark"
  • "authorID": unique author ID
  • "authorName"
  • "authorRole": the author's role on the site
In [3]:
targetCols =["id","date","permalink","title","category","view",\
                 "like","comment","bookmark","authorID","authorName","authorRole"]

def readDataFromFile(dataFile, cols):
    temp = pd.read_csv(dataFile)
    temp = temp[cols]
    temp.set_index("id", inplace=True)
    return temp

df = readDataFromFile("0821.csv", targetCols)

df.head()
Out[3]:
date permalink title category view like comment bookmark authorID authorName authorRole
id
599 2012-05-17 http://www.woshipm.com/pd/599.html 企业门户设计的几点经验 产品设计 2079 0 1 3 1 老曹 站长
598 2012-05-17 http://www.woshipm.com/pd/598.html 网页设计师,请不要独自战斗 产品设计 1179 0 0 0 1 老曹 站长
597 2012-05-17 http://www.woshipm.com/pd/597.html 后移动时代的 Web 设计 产品设计 2435 0 0 0 1 老曹 站长
596 2012-05-17 http://www.woshipm.com/pd/596.html 产品经理,最是那自我阉割的惊艳 产品设计 3810 0 0 0 1 老曹 站长
595 2012-05-17 http://www.woshipm.com/pd/595.html 【连载一】twitter的新浪微博式创新系列之注册篇 产品设计 2146 0 0 1 1 老曹 站长

2.2. Selecting and cleaning values

2.2.1. Handling view counts that end in 万

In [4]:
df.view.describe()
Out[4]:
count     45213
unique     8990
top        1.1万
freq       1963
Name: view, dtype: object
In [5]:
def getViewerValue(inputStr):
    """Convert view-count strings such as '1.1万' (or '2.5m') to integers."""
    try:
        if "m" in inputStr:
            return int(float(inputStr[:-1])*1000000)   # e.g. '2.5m' -> 2500000
        if "万" in inputStr:
            return int(float(inputStr[:-1])*10000)     # e.g. '1.1万' -> 11000
        return int(inputStr)
    except (ValueError, TypeError):
        # unparseable values fall through as None and become NaN after to_numeric
        return None

df["view"] = df.view.apply(getViewerValue)

df["view"] = pd.to_numeric(df.view)

df.view.describe()
Out[5]:
count    4.521300e+04
mean     1.148534e+04
std      2.478602e+04
min      1.400000e+01
25%      4.141000e+03
50%      7.655000e+03
75%      1.200000e+04
max      2.500000e+06
Name: view, dtype: float64

2.2.2. Date handling

Trade a little space for speed: store each article's publication year, year-month, and age in days as separate columns, which makes later grouping and filtering straightforward.

In [6]:
df.date = pd.to_datetime(df.date)

df["year"] = df.date.dt.strftime("%Y")

df["ymonth"] = df.date.dt.strftime("%Y-%m")

df["days"] = [x.days for x in (pd.datetime.strptime("2018-8-21","%Y-%m-%d") - df.date)]

2.3. Shared helpers

In [7]:
def getLevel(inputView, highV, lowV):
    # bucket a metric into "High" / "Mid" / "Low" given two cut-off values
    if inputView >= highV:
        return "High"
    if inputView <= lowV:
        return "Low"
    return "Mid"

3. Basic figures

3.1. Time in operation

In [8]:
# Days online (oldest article to crawl date)

df.days.max()
Out[8]:
2287
In [9]:
# Years in operation

df.days.max()/365
Out[9]:
6.265753424657534

3.2. Article volume

In [10]:
# Total number of articles

artCnt = len(df)

artCnt
Out[10]:
45213
In [11]:
# Average articles published per day

artCnt / df.days.max()
Out[11]:
19.769567118495846
In [12]:
df.groupby("days").size().std(ddof=0)
Out[12]:
47.89091141098537
In [13]:
# Average articles published per year

len(df) / (df.days.max()/365)
Out[13]:
7215.891998250984
In [14]:
df.groupby("year").size().std(ddof=0)
Out[14]:
1590.5937795139812

3.3. Authors

In [15]:
# Number of distinct authors

len(df.authorID.unique())
Out[15]:
4138
In [16]:
df.groupby("authorID").size().std(ddof=0)
Out[16]:
91.71881415404536
In [17]:
# Total views

df.view.sum()
Out[17]:
519286607

3.4. Views

In [18]:
# Average views per article

df.view.sum() / len(df)
Out[18]:
11485.338442483357
In [19]:
df.view.std(ddof=1)
Out[19]:
24786.02120215524

3.5. Likes, bookmarks and comments

In [20]:
df[["like","bookmark","comment"]].sum()
Out[20]:
like         642007
bookmark    2456296
comment      145744
dtype: int64
In [21]:
df[["like","bookmark","comment"]].sum()/artCnt
Out[21]:
like        14.199611
bookmark    54.327207
comment      3.223498
dtype: float64
In [22]:
df[["like","bookmark","comment"]].std(ddof=0)
Out[22]:
like         36.700739
bookmark    116.935689
comment      11.013832
dtype: float64
In [23]:
df.category = df.category.fillna("未分类")

# Number of article categories

len(df.category.unique().tolist())
Out[23]:
23
In [24]:
df.category.unique().tolist()
Out[24]:
['产品设计',
 '业界动态',
 '招聘信息',
 '产品运营',
 '职场攻略',
 '产品经理',
 '分析评测',
 '创业学院',
 '交互体验',
 '数据分析',
 '用户研究',
 '营销推广',
 '讲座沙龙',
 '人人专栏',
 '区块链',
 '大咖分享',
 '原型设计',
 'AI人工智能',
 '干货下载',
 '文案策划',
 '未分类',
 '新零售',
 '大咖视频']
In [25]:
df.authorRole = df.authorRole.fillna("普通用户")

# Number of author roles

len(df.authorRole.unique().tolist())
Out[25]:
23
In [26]:
df.authorRole.unique().tolist()
Out[26]:
['站长',
 '运营小编',
 '官方',
 '专栏作家',
 '运营小哥',
 '运营小妹',
 '普通用户',
 '设计小妹',
 '运营',
 '合作媒体',
 '编辑',
 '作者',
 '萌妹子',
 '主编',
 '女神',
 '男神2',
 '临时工',
 '特邀作者',
 '运营哥',
 '美少女',
 'CV工程师',
 '合作出版社',
 '小编妹妹']

4. Operations data

4.1. Merging official roles

Re-group the author roles as follows:

  Merged role     Original role names
  站长            '站长'
  平台运营        '运营小编', '官方', '运营小哥', '运营小妹', '设计小妹', '运营', '编辑', '作者', '萌妹子', '主编', '女神', '男神2', '临时工', '运营哥', '美少女', 'CV工程师', '小编妹妹'
  普通用户        '普通用户'
  高级用户        '专栏作家'
  第三方合作      '合作媒体', '特邀作者', '合作出版社'
In [27]:
temp = df.copy()

temp.authorRole.unique().tolist()

opsRoles = ['运营小编', '官方', '运营小哥', '运营小妹', '设计小妹', '运营', '编辑', '作者',
            '萌妹子', '主编', '女神', '男神2', '临时工', '运营哥', '美少女', 'CV工程师', '小编妹妹']
temp.loc[temp.authorRole.isin(opsRoles), "authorRole"] = "平台运营"

temp.loc[temp.authorRole.isin(['合作媒体', '特邀作者', '合作出版社']), "authorRole"] = "第三方合作"

roleDF = temp.copy()

roleDF.authorRole.unique().tolist()
Out[27]:
['站长', '平台运营', '专栏作家', '普通用户', '第三方合作']

4.2. Author submissions

In [28]:
mArtDF = df.groupby("ymonth").size().copy()

mArtDF = pd.DataFrame(mArtDF, columns=["cnt"])

x = mArtDF.index.tolist()

y = mArtDF.cnt.tolist()

fig, ax1 = plt.subplots(1, 1, figsize=(30, 10), sharex=True)

g = sns.barplot(x=x, y=y, palette="rocket", ax=ax1)

g.axhline(0, color="k", clip_on=False)

g.set_ylabel("月投稿总数")

g.set_xticklabels(rotation=30,labels=x)

g.set_title("2012.5~2018.8 各月份投稿数量", fontsize=24)
Out[28]:
Text(0.5,1,'2012.5~2018.8 各月份投稿数量')

4.2.1. Who were the main contributors in each period?

In [29]:
temp = roleDF.copy()

temp = temp.groupby([temp.year, temp.authorRole]).size()

temp = temp.unstack("authorRole").fillna(0)

temp = temp.T

temp["合计"] = temp.sum(axis=1)

temp = round(temp.sort_values("合计"),0)

temp.loc["合计"] =temp.sum()

temp.index.name = "作者角色"

temp.columns.name = "年度"

temp
Out[29]:
年度 2012 2013 2014 2015 2016 2017 2018 合计
作者角色
第三方合作 4.0 0.0 2.0 120.0 527.0 642.0 420.0 1715.0
站长 4185.0 545.0 152.0 26.0 0.0 0.0 0.0 4908.0
专栏作家 4.0 75.0 154.0 1295.0 2264.0 3378.0 1758.0 8928.0
普通用户 122.0 704.0 796.0 672.0 2377.0 3804.0 3071.0 11546.0
平台运营 204.0 2886.0 5608.0 4888.0 2080.0 1440.0 1010.0 18116.0
合计 4519.0 4210.0 6712.0 7001.0 7248.0 9264.0 6259.0 45213.0
In [30]:
f, ax = plt.subplots(figsize=(16, 9))

sns.heatmap(temp.drop("合计",axis=1), annot=True, linewidths=1, fmt='.0f', ax=ax, cmap="YlGnBu")

ax.set_title("2012~2018 投稿分布")

plt.yticks(rotation=0) 
Out[30]:
(array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5]), <a list of 6 Text yticklabel objects>)

4.2.2. Contributor counts by year

In [31]:
# Number of distinct contributors per year (one row kept per author per year)

temp = roleDF.sort_index().drop_duplicates(subset=["authorID","year"], keep="first")
In [32]:
temp = temp.groupby(["year","authorRole"]).size().unstack("year")
In [33]:
f, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(temp, annot=True, linewidths=1, fmt='.0f', ax=ax, cmap="YlGnBu")
ax.set_title("2012~2018 投稿人次")
plt.yticks(rotation=0) 
Out[33]:
(array([0.5, 1.5, 2.5, 3.5, 4.5]), <a list of 5 Text yticklabel objects>)

4.2.3. Distribution of submissions per user

In [34]:
temp = roleDF[roleDF.authorRole.isin(["普通用户","专栏作家"])].copy()

len(temp)

maxShow = 50

# number of articles per author
temp = temp.groupby(temp.authorID).size()

# how many authors published exactly k articles, for k = 1..maxShow
values = pd.DataFrame(temp).groupby([0]).size()[:maxShow]

# authors with more than maxShow articles go into one overflow bucket
values["maxShow+"] = len(temp[temp>maxShow])

values.index
Out[34]:
Index([         1,          2,          3,          4,          5,          6,
                7,          8,          9,         10,         11,         12,
               13,         14,         15,         16,         17,         18,
               19,         20,         21,         22,         23,         24,
               25,         26,         27,         28,         29,         30,
               31,         32,         33,         34,         35,         36,
               37,         38,         39,         40,         41,         42,
               43,         44,         45,         46,         47,         48,
               49,         50, 'maxShow+'],
      dtype='object', name=0)
In [35]:
fig, ax1 = plt.subplots(1, 1, figsize=(16,9), sharex=True)

x1 = [str(x) for x in np.arange(1, maxShow + 1)]

x1.append(str(maxShow) + "+")
y1 = values.values   # bucket heights as a plain ndarray



g = sns.barplot(x=x1, y=y1,palette="GnBu_d", ax=ax1)
g.axhline(0, color="k", clip_on=False)
# g.set_xticklabels(rotation=30,labels=x)
g.set_title("用户投稿分布")

for i in range(len(x1)):
    g.text(i,y1[i]+20,str(y1[i]),size=12,ha="center")
In [36]:
sortedLevel = ["L1","L2","L6","L20"]

def groupAmount(inputA):
    if inputA > 21:
        return sortedLevel[3]
    if inputA > 5:
        return sortedLevel[2]
    if inputA == 1:
        return sortedLevel[0]
    return sortedLevel[1]
In [37]:
authLCntDF = roleDF[roleDF.authorRole.isin(["普通用户","专栏作家"])].copy()

authLCntDF = authLCntDF.groupby(["authorID","authorRole"]).size()

authLCntDF = pd.DataFrame(authLCntDF, columns=["amount"])

authLCntDF["level"] = authLCntDF.amount.apply(groupAmount)

authLCntDF = authLCntDF.reset_index("authorRole")

authLCntDF = authLCntDF.groupby([authLCntDF.authorRole, authLCntDF.level]).size()

authLCntDF =  authLCntDF.unstack("level")
In [38]:
authLCntDF = authLCntDF[sortedLevel]

authLCntDF
Out[38]:
level L1 L2 L6 L20
authorRole
专栏作家 8 31 221 124
普通用户 1703 1434 341 32
In [39]:
authLCntDF = authLCntDF.stack()
In [40]:
authLCntDF.index = authLCntDF.index.map("".join)

authLCntDF = pd.DataFrame(authLCntDF,columns = ["作者数量"])

authLCntDF
Out[40]:
作者数量
专栏作家L1 8
专栏作家L2 31
专栏作家L6 221
专栏作家L20 124
普通用户L1 1703
普通用户L2 1434
普通用户L6 341
普通用户L20 32
In [41]:
authLPubDF = roleDF[roleDF.authorRole.isin(["普通用户","专栏作家"])].copy()

authLPubDF = authLPubDF.groupby(["authorID","authorRole"]).size()

authLPubDF = pd.DataFrame(authLPubDF, columns=["amount"])

authLPubDF["level"] = authLPubDF.amount.apply(groupAmount)

authLPubDF = authLPubDF.reset_index("authorRole")

authLPubDF = authLPubDF.groupby([authLPubDF.authorRole, authLPubDF.level]).sum()

authLPubDF
Out[41]:
amount
authorRole level
专栏作家 L1 8
L2 115
L20 5962
L6 2843
普通用户 L1 1703
L2 4138
L20 2919
L6 2786
In [42]:
authLPubDF = authLPubDF.unstack("level")

authLPubDF = authLPubDF.amount

authLPubDF = authLPubDF[sortedLevel]

authLPubDF =pd.DataFrame(authLPubDF.stack(0))

authLPubDF.index = authLPubDF.index.map("".join)

authLPubDF.columns = ["投稿量"]
In [43]:
authLPubDF.index
Out[43]:
Index(['专栏作家L1', '专栏作家L2', '专栏作家L6', '专栏作家L20', '普通用户L1', '普通用户L2', '普通用户L6',
       '普通用户L20'],
      dtype='object')
In [44]:
authLCntDF.index
Out[44]:
Index(['专栏作家L1', '专栏作家L2', '专栏作家L6', '专栏作家L20', '普通用户L1', '普通用户L2', '普通用户L6',
       '普通用户L20'],
      dtype='object')
In [45]:
# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(16, 9))

# Plot total submissions for each author level (tall bars)
sns.set_color_codes("muted")
sns.barplot(x=authLPubDF.index, y="投稿量", data=authLPubDF, label="投稿量", color="b")


# Overlay the number of authors in each level
sns.set_color_codes("pastel")
sns.barplot(x=authLCntDF.index, y="作者数量", data=authLCntDF, label="作者数量", color="b")

# Add a legend and informative axis label
ax.legend(ncol=2, loc="upper right", frameon=True)
ax.set(ylabel="此等级用户投稿总量",xlabel="用户等级",ylim=(0,6000))
sns.despine(left=True, bottom=True)

4.2.4. Users' submission preferences

In [46]:
temp = roleDF.copy()

temp = temp[temp.date >= "2017-8-1"].copy()

temp = temp[temp.authorRole.isin(["普通用户","专栏作者"])]

temp = temp.groupby(["ymonth", "category"]).size().unstack(0).fillna(0)

orderIndex = temp.sum(axis=1).sort_values(ascending=False).index.tolist()

temp = temp.reindex(orderIndex)

temp
Out[46]:
ymonth 2017-08 2017-09 2017-10 2017-11 2017-12 2018-01 2018-02 2018-03 2018-04 2018-05 2018-06 2018-07 2018-08
category
产品设计 90.0 70.0 68.0 86.0 79.0 96.0 48.0 92.0 90.0 72.0 85.0 95.0 70.0
产品运营 66.0 61.0 66.0 60.0 55.0 53.0 41.0 56.0 53.0 51.0 35.0 63.0 53.0
业界动态 52.0 45.0 47.0 56.0 44.0 57.0 20.0 48.0 47.0 41.0 52.0 76.0 67.0
产品经理 80.0 74.0 34.0 46.0 57.0 61.0 42.0 62.0 26.0 30.0 44.0 49.0 45.0
分析评测 37.0 40.0 31.0 23.0 9.0 15.0 13.0 24.0 40.0 31.0 32.0 32.0 29.0
交互体验 22.0 18.0 19.0 10.0 9.0 11.0 10.0 7.0 13.0 22.0 20.0 30.0 27.0
原型设计 19.0 10.0 18.0 13.0 3.0 11.0 7.0 5.0 0.0 7.0 10.0 30.0 35.0
职场攻略 17.0 13.0 7.0 4.0 7.0 6.0 5.0 18.0 26.0 11.0 14.0 17.0 11.0
用户研究 11.0 5.0 6.0 5.0 3.0 4.0 3.0 10.0 17.0 19.0 22.0 30.0 18.0
营销推广 0.0 0.0 0.0 0.0 0.0 1.0 5.0 11.0 22.0 28.0 37.0 25.0 23.0
创业学院 16.0 22.0 14.0 21.0 15.0 9.0 4.0 6.0 10.0 10.0 3.0 7.0 6.0
AI人工智能 0.0 1.0 3.0 3.0 1.0 15.0 3.0 17.0 17.0 22.0 24.0 11.0 11.0
数据分析 11.0 13.0 6.0 6.0 18.0 14.0 3.0 5.0 8.0 15.0 4.0 9.0 10.0
区块链 1.0 3.0 1.0 0.0 2.0 6.0 7.0 13.0 17.0 13.0 4.0 4.0 3.0
文案策划 0.0 0.0 0.0 0.0 0.0 4.0 2.0 3.0 6.0 12.0 8.0 5.0 2.0
讲座沙龙 0.0 0.0 3.0 2.0 1.0 5.0 1.0 5.0 1.0 3.0 7.0 4.0 3.0
大咖分享 8.0 8.0 4.0 2.0 2.0 1.0 1.0 1.0 0.0 0.0 0.0 4.0 1.0
新零售 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 2.0 8.0 2.0
未分类 4.0 2.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
In [47]:
f, ax = plt.subplots(figsize=(16, 9))

sns.heatmap(temp, annot=True, linewidths=1, fmt='.0f', ax=ax, cmap="YlGnBu")

ax.set_title("过去一年各分类投稿数量")

plt.yticks(rotation=0) 
Out[47]:
(array([ 0.5,  1.5,  2.5,  3.5,  4.5,  5.5,  6.5,  7.5,  8.5,  9.5, 10.5,
        11.5, 12.5, 13.5, 14.5, 15.5, 16.5, 17.5, 18.5]),
 <a list of 19 Text yticklabel objects>)
In [48]:
yArtCntDF = roleDF.copy()

yArtCntDF = yArtCntDF[yArtCntDF.authorRole.isin(["普通用户","专栏作者"])]

yArtCntDF = yArtCntDF.groupby(["year", "category"]).size().unstack(0)

orderIndex = yArtCntDF.sum(axis=1).sort_values(ascending=False).index.tolist()

yArtCntDF = yArtCntDF.reindex(orderIndex)

yArtCntDF
Out[48]:
year 2012 2013 2014 2015 2016 2017 2018
category
产品设计 6.0 103.0 76.0 136.0 406.0 799.0 648.0
业界动态 101.0 326.0 382.0 104.0 337.0 480.0 408.0
产品运营 11.0 122.0 56.0 100.0 505.0 683.0 405.0
产品经理 3.0 44.0 188.0 130.0 476.0 676.0 359.0
分析评测 NaN NaN NaN 94.0 207.0 291.0 216.0
交互体验 NaN 46.0 21.0 29.0 111.0 171.0 140.0
职场攻略 NaN 6.0 21.0 28.0 82.0 122.0 108.0
原型设计 NaN 3.0 1.0 15.0 81.0 147.0 105.0
创业学院 NaN NaN NaN 16.0 89.0 164.0 55.0
数据分析 NaN NaN NaN NaN 32.0 123.0 68.0
用户研究 NaN NaN NaN NaN 28.0 70.0 123.0
营销推广 NaN NaN NaN NaN NaN NaN 152.0
AI人工智能 NaN NaN NaN NaN NaN 9.0 120.0
人人专栏 NaN 24.0 37.0 6.0 9.0 NaN NaN
区块链 NaN NaN NaN NaN NaN 8.0 67.0
大咖分享 NaN NaN NaN 2.0 5.0 35.0 8.0
讲座沙龙 NaN NaN 3.0 5.0 NaN 6.0 29.0
文案策划 NaN NaN NaN NaN NaN NaN 42.0
未分类 NaN 7.0 6.0 1.0 4.0 20.0 2.0
招聘信息 1.0 22.0 4.0 3.0 NaN NaN NaN
新零售 NaN NaN NaN NaN NaN NaN 16.0
干货下载 NaN 1.0 1.0 3.0 5.0 NaN NaN
In [49]:
f, ax = plt.subplots(figsize=(16, 9))

sns.heatmap(yArtCntDF, annot=True, linewidths=1, fmt='.0f', ax=ax, cmap="YlGnBu")

ax.set_title("2012~2018 用户投稿偏好分布")

plt.yticks(rotation=0) 
Out[49]:
(array([ 0.5,  1.5,  2.5,  3.5,  4.5,  5.5,  6.5,  7.5,  8.5,  9.5, 10.5,
        11.5, 12.5, 13.5, 14.5, 15.5, 16.5, 17.5, 18.5, 19.5, 20.5, 21.5]),
 <a list of 22 Text yticklabel objects>)

4.3. Article engagement

In [50]:
profitCols = ["view","like","bookmark","comment"]

4.3.1. How attractive were articles in each category over the years?

In [51]:
# Total views per category per year

yCateViewDF = df.copy()

yCateViewDF = yCateViewDF[profitCols+["year","category"]].groupby(["year","category"]).sum()["view"]

yCateViewDF = yCateViewDF.unstack("year").fillna(0)

yArtCntDF = roleDF.copy()

yArtCntDF = yArtCntDF.groupby(["year", "category"]).size().unstack(0).fillna(0)

yArtCntDF.head()

yArtViewAvgDF = round(yCateViewDF/yArtCntDF,0)

yArtViewAvgDF
Out[51]:
year 2012 2013 2014 2015 2016 2017 2018
category
AI人工智能 NaN NaN NaN NaN NaN 8141.0 3677.0
业界动态 3961.0 7112.0 7835.0 9713.0 10368.0 8037.0 5073.0
交互体验 5635.0 7929.0 14355.0 16659.0 14133.0 10529.0 6291.0
产品经理 20488.0 11772.0 14880.0 17319.0 17074.0 14475.0 11146.0
产品设计 4956.0 8201.0 14326.0 16867.0 15540.0 11685.0 6818.0
产品运营 4049.0 6220.0 10789.0 20756.0 15933.0 12274.0 9847.0
人人专栏 NaN 22737.0 24961.0 18075.0 16596.0 NaN 4868.0
分析评测 NaN NaN 44091.0 25084.0 28055.0 18822.0 10231.0
创业学院 NaN NaN 7903.0 11287.0 8874.0 6837.0 4604.0
区块链 NaN NaN NaN NaN 11440.0 7927.0 6718.0
原型设计 23000.0 85359.0 92041.0 95688.0 35725.0 25234.0 13360.0
大咖分享 NaN NaN NaN 23333.0 15440.0 6090.0 5622.0
大咖视频 NaN NaN NaN 10430.0 NaN NaN NaN
干货下载 67620.0 59967.0 26305.0 111722.0 114625.0 NaN NaN
招聘信息 8336.0 4630.0 4890.0 8558.0 18000.0 10990.0 17000.0
数据分析 NaN NaN NaN NaN 25812.0 20775.0 6870.0
文案策划 NaN NaN NaN NaN NaN NaN 6942.0
新零售 NaN NaN NaN NaN NaN NaN 3724.0
未分类 6889.0 7964.0 12894.0 12362.0 17280.0 9515.0 5708.0
用户研究 NaN NaN NaN NaN 16483.0 13948.0 5091.0
职场攻略 3652.0 7265.0 10987.0 16630.0 16515.0 11376.0 7334.0
营销推广 NaN NaN NaN NaN NaN NaN 6034.0
讲座沙龙 NaN 24000.0 19769.0 7341.0 16968.0 14920.0 10288.0
In [52]:
f, ax = plt.subplots(figsize=(16, 9))

sns.heatmap(yArtViewAvgDF, annot=True, linewidths=1, fmt='.0f', ax=ax, cmap="YlGnBu")

ax.set_title("2012~2018 分类文章阅读均值")

plt.yticks(rotation=0) 
Out[52]:
(array([ 0.5,  1.5,  2.5,  3.5,  4.5,  5.5,  6.5,  7.5,  8.5,  9.5, 10.5,
        11.5, 12.5, 13.5, 14.5, 15.5, 16.5, 17.5, 18.5, 19.5, 20.5, 21.5,
        22.5]), <a list of 23 Text yticklabel objects>)

4.3.2. Re-scoring articles by engagement distribution

In [53]:
def getHighRitio(colName, HighV, LowV):
    temp = df.copy()
    # name of the derived level column, e.g. "viewL"
    Lname = colName + "L"
    # rate every article on this metric as High / Mid / Low
    temp[Lname] = temp[colName].apply(getLevel, args=[HighV, LowV])
    temp = pd.DataFrame(temp.groupby(["ymonth", Lname]).size(), columns=["cnt"])
    # number of High / Mid / Low articles in each month
    temp = temp.unstack(level=Lname)
    temp = temp.cnt
    temp = temp.fillna(0)
    temp["Total"] = temp.sum(axis=1)
    # share of High-rated articles in each month
    HR = colName + "HR"
    temp[HR] = temp["High"]/temp["Total"]
    return temp
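
The High/Low cut-offs passed to getHighRitio in the next cell (12000 / 4141 for view, 15 / 2 for like, 56 / 1 for bookmark, 3 / 0 for comment) equal, for view, exactly the 75% / 25% values in Out[5]. Assuming the other metrics' cut-offs were chosen the same way, they can be read off directly; a small sketch:

# Assumed derivation of the High/Low thresholds: the upper and lower quartiles of each metric.
df[["view", "like", "bookmark", "comment"]].quantile([0.25, 0.75])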

4.3.3. Distribution of submission quality

In [54]:
mArtDF = df.groupby("ymonth").size().copy()

mArtDF = pd.DataFrame(mArtDF, columns=["cnt"])

x = mArtDF.index.tolist()

y = mArtDF.cnt.tolist()

mArtDF.cnt.max()

viewDF = getHighRitio("view",12000,4141)

likeDF = getHighRitio("like",15,2)

bookDF = getHighRitio("bookmark", 56, 1)

comDF = getHighRitio("comment", 3, 0)

VLBCdf = pd.concat([viewDF.viewHR, likeDF.likeHR, bookDF.bookmarkHR, comDF.commentHR], axis=1)

# scale the 0-1 ratios up (x3000) so they can be overlaid on the monthly submission bar chart below
VLBCdf = VLBCdf*3000

VLBCAvgDF = (viewDF.viewHR + likeDF.likeHR + bookDF.bookmarkHR + comDF.commentHR)/4

VLBCAvgDF = pd.DataFrame(VLBCAvgDF*3000)

VLBCAvgDF.head()
Out[54]:
0
ymonth
2012-05 39.209115
2012-06 179.347826
2012-07 182.741117
2012-08 99.650350
2012-09 91.216216
In [55]:
fig, ax1 = plt.subplots(1, 1, figsize=(30, 10), sharex=True)

g = sns.barplot(x=x, y=y, palette=sns.cubehelix_palette(12, start=.5, rot=-.75), ax=ax1)

g.axhline(0, color="k", clip_on=False)

g.set_ylabel("月投稿总数")

g.set_xticklabels(rotation=30,labels=x)

g.set_title("2012.5~2018.8 各月份投稿数量")

sns.lineplot(data=VLBCdf, palette="husl", linewidth=2.5, ax=ax1)
Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f9b76a0>