Porn Data Analysis — 上传者 分类信息分析(github)
'''
视频作者 视频分类信息分析
http://www.h4ck.org.cn
by obaby
obaby@mars
email:root@obaby.org.cn
date: 2020.09.04
'''
from pyspark.sql.functions import col
import altair as alt
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
# Load the movie table from HDFS; the first CSV row supplies the column names.
# NOTE: `csv` shadows the stdlib module of the same name, but other cells in
# this notebook refer to it, so the name is kept.
csv = spark.read.option("header", True).csv("hdfs://localhost:9000/data2/porn_data_movie.csv")
csv.printSchema()
csv.select('name', 'describe', 'uploader_id').show()

# Load the uploader table the same way.
uploader_csv = spark.read.option("header", True).csv("hdfs://localhost:9000/data2/porn_data_uploader.csv")
uploader_csv.printSchema()
uploader_csv.select('name', 'id', 'nickname', 'porn_site_id').show()

# Both tables expose a `name` column; rename the movie one so the joined
# result can tell the movie title apart from the uploader name.
movie_csv = csv.withColumnRenamed('name', 'movie_name')
movie_csv.show()

# Inner join: keep only movies whose uploader_id matches an uploader id.
# Despite the `_rdd` suffix, the result of join() is used as a DataFrame below.
uploader_rdd = movie_csv.select('movie_name', 'uploader_id').join(
    uploader_csv, movie_csv.uploader_id == uploader_csv.id, "inner"
)
uploader_rdd.select('movie_name', 'uploader_id', 'name', 'nickname', 'porn_site_id').show()
# Count how many joined movie rows each uploader name has.
uc = uploader_rdd.select('name')
# Each RDD element is the selected one-column row; the whole row is used as
# the reduce key, so the key in the result is a row rather than a plain string.
upload_movie_count_rdd = uc.rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
upload_movie_count_rdd.take(10)
# Sort by count, largest first, and pull the result into pandas. The chart
# below addresses the key column as `_1` and the count column as `_2`.
tp = upload_movie_count_rdd.sortBy(lambda a: a[1], ascending=False).toDF().toPandas()
tp.head()
# Bar chart of the 45 uploaders with the most videos.
alt.Chart(tp[:45]).mark_bar().encode(
    x=alt.X('_1', title='作者名称', sort='-y'),
    y=alt.Y('_2', title='视频数量')
)
from pyspark.sql.functions import col, desc, lit

# List the movies (title and `create` column) uploaded by one specific uploader.
uploader_rdd.filter(col('name') == '我想静静').select('movie_name', 'create').show()

# Count movie rows that carry no uploader id, and the overall row count.
no_uploader_count = csv.filter('uploader_id is null').count()
total_count = csv.count()
print('没有上传作者信息行数:', no_uploader_count)
print('总行数:', total_count)

# Rows that do have an uploader are simply the remainder.
with_uploader_count = total_count - no_uploader_count

# Small pandas frame feeding a three-bar chart: without / with uploader / total.
source = pd.DataFrame({
    'name': ['无作者', '有作者', '总数'],
    'count': [no_uploader_count, with_uploader_count, total_count],
})
alt.Chart(source).mark_bar().encode(
    x='name',
    y='count'
)
from matplotlib.font_manager import FontManager
# Gather the distinct names of every entry in the font manager's `ttflist`
# and print them, so a font able to render Chinese labels can be picked.
fm = FontManager()
mat_fonts = {font_entry.name for font_entry in fm.ttflist}
print(mat_fonts)
from matplotlib.font_manager import FontProperties
def getChineseFont(fname='/System/Library/Fonts/PingFang.ttc', size=15):
    """Return a matplotlib ``FontProperties`` for rendering Chinese text.

    Args:
        fname: Path to the font file. The default is the PingFang font path
            the notebook was originally written against; pass another path
            on systems where that file does not exist.
        size: Font size handed to ``FontProperties``.

    Returns:
        A ``FontProperties`` instance built from ``fname`` and ``size``.
    """
    return FontProperties(fname=fname, size=size)
def create_pie_chart(font_name):
    """Draw a pie chart of movies without vs. with an uploader.

    Reads the module-level ``no_uploader_count`` and ``with_uploader_count``
    values and shows the figure with ``plt.show()``.

    Args:
        font_name: Font family name applied globally through ``plt.rc`` so
            the Chinese slice labels can be rendered.
    """
    font = {
        'family': font_name,
        'weight': 'bold',
        'size': 10,
    }
    # Applies to every later matplotlib figure as well, not only this one.
    plt.rc("font", **font)
    # plt.rcParams['font.sans-serif'] = ['Songti SC']
    labels = '无作者', '有作者'
    sizes = [no_uploader_count, with_uploader_count]
    # The original kept an unused 4-element `explode` tuple copied from a
    # four-slice example; it never reached pie() and is dropped here.
    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
            shadow=True, startangle=90)
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.show()
# To find a usable font, iterate over the collected font names and try each:
# for f in mat_fonts:
#     print(f)
#     create_pie_chart(f)
# Noto Serif CJK JP can display Chinese characters.
create_pie_chart('Noto Serif CJK JP')
# Read the category table from HDFS; the first CSV row supplies the column names.
category_csv = spark.read.option("header", True).csv("hdfs://localhost:9000/data2/porn_data_category.csv")
category_csv.printSchema()

# Inner join: keep only movies whose category_id matches a category id.
movie_cat_rdd = movie_csv.select('movie_name', 'category_id').join(
    category_csv, movie_csv.category_id == category_csv.id, "inner"
)
movie_cat_rdd.select('movie_name', 'name', 'id').show()

# Count movies per category name. The whole one-column row is used as the
# reduce key; later cells split that key column, so its shape is left as is.
tr = movie_cat_rdd.select('name')
movie_cat_count_rdd = tr.rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
movie_cat_count_rdd.take(10)
# Sort by count, largest first, and pull into pandas; the charts address the
# key column as `_1` and the count column as `_2`.
cp = movie_cat_count_rdd.sortBy(lambda a: a[1], ascending=False).toDF().toPandas()
cp.head()

# Horizontal bar chart of video count per category, with the count printed
# next to each bar.
bars = alt.Chart(cp).mark_bar().encode(
    x=alt.X('_2', title='视频数量'),
    y=alt.Y('_1', title='分类名称', sort='-x')
)
text = bars.mark_text(align='left', baseline='middle', dx=3).encode(text='_2')
(bars + text).properties(height=1400, width=800)
# Same per-category data expressed as a share of the total.
# `TotalTime` is the sum of all counts (the name is kept from the original);
# `PercentOfTotal` is the 0-1 fraction used for the axis and `poft` is the
# same value scaled to 0-100 for the text labels.
bar2 = alt.Chart(cp).transform_joinaggregate(
    TotalTime='sum(_2)',
).transform_calculate(
    PercentOfTotal="datum._2 / datum.TotalTime",
    poft="datum._2 / datum.TotalTime *100"
).mark_bar().encode(
    alt.X('PercentOfTotal:Q', axis=alt.Axis(format='.0%')),
    y=alt.Y('_1', title='分类名称', sort='-x')
)
text2 = bar2.mark_text(align='left', baseline='middle', dx=3).encode(text='poft:Q')
(bar2 + text2).properties(height=1400, width=800)
cp
# Split the tuple-like key column, see
# https://stackoverflow.com/questions/29550414/how-to-split-column-of-tuples-in-pandas-dataframe
# The key in `_1` holds a single field (the category name). The original
# unpacked it by iterating the `.str` accessor, which current pandas no longer
# supports; indexing element 0 yields the same single column `a`.
cp = cp.assign(a=cp._1.str[0])
cp
# Pie chart of the share of videos per category, labelled by category name.
fig2, ax2 = plt.subplots(figsize=(15, 15))
ax2.pie(cp._2, labels=cp.a, autopct='%1.1f%%',
        shadow=False, startangle=90)
ax2.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
复制并粘贴此 URL 进您的 WordPress 站点来嵌入
复制并粘贴此 URL 进您的站点来嵌入