Porn Data Anaylize — 视频数据初探

'''
--------------------------------------------------------------------------------
福利数据解析
基础数据分析,标题分词,词频统计
-----------------------------------
by:obaby
email: root@obaby.org.cn
blog:http://www.h4ck.org.cn
===================================
参考链接:https://sparkbyexamples.com/pyspark/select-columns-from-pyspark-dataframe/
-------------------------------------------------------------------------------
'''
import jieba
# 通过spark read csv格式文件,从csv header解析数据结构
csv = spark.read.option("header",True).csv("hdfs://localhost:9000/data2/porn_data_movie.csv")
# 数据格式
csv.printSchema()
root
 |-- id: string (nullable = true)
 |-- create: string (nullable = true)
 |-- update: string (nullable = true)
 |-- name: string (nullable = true)
 |-- describe: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- play_count: string (nullable = true)
 |-- good_count: string (nullable = true)
 |-- bad_count: string (nullable = true)
 |-- link_count: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- designation: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- porn_site_id: string (nullable = true)
 |-- uploader_id: string (nullable = true)
 |-- producer: string (nullable = true)

csv.show()
+---+--------------------+--------------------+------------------------+------------------------+---------+------------+----------+----------+---------+----------+-------------+-----------+-----------+------------+-----------+--------+
| id|              create|              update|                    name|                describe|source_id|publish_time|play_count|good_count|bad_count|link_count|comment_count|designation|category_id|porn_site_id|uploader_id|producer|
+---+--------------------+--------------------+------------------------+------------------------+---------+------------+----------+----------+---------+----------+-------------+-----------+-----------+------------+-----------+--------+
|  1|7/5/2020 05:20:14...|7/5/2020 05:20:14...|    美女学生考试时被中出|                    null|    76009|  1588819417|      2566|        14|        2|        48|            0|       null|          1|           1|          1|    null|
|  2|7/5/2020 05:21:04...|7/5/2020 05:21:04...|      无套中出内射(上)|            无套中出内射|    76021|  1588757668|     11199|        58|        3|       338|            0|       null|          1|           1|          2|    null|
|  3|7/5/2020 05:21:06...|7/5/2020 05:21:06...|      无套中出内射(下)|            无套中出内射|    76028|  1588757665|      7067|        39|        3|       310|            0|       null|          1|           1|          2|    null|
|  4|7/5/2020 05:21:12...|7/5/2020 05:21:13...|                极品嫩妹|                    null|    76053|  1588756935|      3372|        18|        2|       159|            0|       null|          1|           1|          3|    null|
|  5|7/5/2020 05:21:18...|7/5/2020 05:21:19...|                骚妹妹 7|                    null|    76051|  1588756876|      1975|        10|        0|        53|            0|       null|          1|           1|          3|    null|
|  6|7/5/2020 05:22:13...|7/5/2020 05:22:13...|漂亮美女完美身材甜美声音|漂亮美女完美身材甜美声音|    75971|  1588744135|     17097|        31|        3|       222|            0|       null|          1|           1|          4|    null|
|  7|7/5/2020 05:22:19...|7/5/2020 05:22:19...|          jk捆绑萝莉束缚|                    null|    75342|  1588739354|      8362|        29|        6|       201|            0|       null|          1|           1|          1|    null|
|  8|7/5/2020 05:22:23...|7/5/2020 05:22:23...|            最纯萝莉性爱|                    null|    75112|  1588739158|     10329|        61|        5|       434|            0|       null|          1|           1|          1|    null|
|  9|7/5/2020 05:22:30...|7/5/2020 05:22:31...|          整容脸制服美女|                    null|    75104|  1588739080|      5297|        26|        2|       221|            0|       null|          1|           1|          1|    null|
| 10|7/5/2020 05:22:38...|7/5/2020 05:22:38...|             星 调教萝莉|                    null|    75563|  1588738220|      4725|        31|        5|       166|            0|       null|          1|           1|          5|    null|
| 11|7/5/2020 05:22:44...|7/5/2020 05:22:45...|        大一学生寝室自慰|                    null|    75601|  1588686744|     20318|       152|        4|       773|            0|       null|          1|           1|          6|    null|
| 12|7/5/2020 05:22:47...|7/5/2020 05:22:48...|    超性感美女给你打飞机|                    null|    75603|  1588686520|     17302|        54|        7|       333|            0|       null|          1|           1|          6|    null|
| 13|7/5/2020 05:22:53...|7/5/2020 05:22:53...|      超可爱萝莉下海资源|                    null|    75747|  1588686493|     18133|        98|       10|       497|            0|       null|          1|           1|          6|    null|
| 14|7/5/2020 05:22:56...|7/5/2020 05:22:57...|      迷倒女儿然后慢慢操|                    null|    75611|  1588686476|     59674|       182|       25|       793|            0|       null|          1|           1|          6|    null|
| 15|7/5/2020 05:22:59...|7/5/2020 05:23:00...|   精灵做爱高潮–影视剪辑|                    null|    75543|  1588686387|      8767|       100|        7|       152|            0|       null|          1|           1|          7|    null|
| 16|7/5/2020 05:23:16...|7/5/2020 05:23:16...|        小萝莉被插到大叫|                    null|    75307|  1588670775|     14213|        71|        8|       360|            0|       null|          1|           1|          6|    null|
| 17|7/5/2020 05:23:21...|7/5/2020 05:23:22...|    睡醒和白嫩女友来一发|                    null|    75318|  1588668616|      5350|        30|        3|       185|            0|       null|          1|           1|          6|    null|
| 18|7/5/2020 05:23:26...|7/5/2020 05:23:27...|  双马尾萝莉甜美自慰诱惑|                    null|    75382|  1588652017|     25311|       190|       10|       890|            0|       null|          1|           1|          8|    null|
| 19|7/5/2020 05:23:28...|7/5/2020 05:23:29...|        调教双马尾小萝莉|                    null|    74932|  1588644358|     29114|       180|       19|       783|            0|       null|          1|           1|          1|    null|
| 20|7/5/2020 05:23:35...|7/5/2020 05:23:36...|    黑丝小萝莉最喜欢玩具|                    null|    74812|  1588585581|     17084|       141|       10|       883|            0|       null|          1|           1|          9|    null|
+---+--------------------+--------------------+------------------------+------------------------+---------+------------+----------+----------+---------+----------+-------------+-----------+-----------+------------+-----------+--------+
only showing top 20 rows
# csv.select('_c3','_c4').show()
#df = csv.withColumnRenamed('_c3','name').withColumnRenamed('_c4', 'describe')
#df.printSchema()
csv.select('name', 'describe').show()
+------------------------+------------------------+
|                    name|                describe|
+------------------------+------------------------+
|    美女学生考试时被中出|                    null|
|      无套中出内射(上)|            无套中出内射|
|      无套中出内射(下)|            无套中出内射|
|                极品嫩妹|                    null|
|                骚妹妹 7|                    null|
|漂亮美女完美身材甜美声音|漂亮美女完美身材甜美声音|
|          jk捆绑萝莉束缚|                    null|
|            最纯萝莉性爱|                    null|
|          整容脸制服美女|                    null|
|             星 调教萝莉|                    null|
|        大一学生寝室自慰|                    null|
|    超性感美女给你打飞机|                    null|
|      超可爱萝莉下海资源|                    null|
|      迷倒女儿然后慢慢操|                    null|
|   精灵做爱高潮–影视剪辑|                    null|
|        小萝莉被插到大叫|                    null|
|    睡醒和白嫩女友来一发|                    null|
|  双马尾萝莉甜美自慰诱惑|                    null|
|        调教双马尾小萝莉|                    null|
|    黑丝小萝莉最喜欢玩具|                    null|
+------------------------+------------------------+
only showing top 20 rows
#csv.count()
from pyspark.sql.functions import col
import altair as alt

import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

no_describe_count = csv.filter('describe is null').select('describe').count()
no_name_count = csv.filter('name is null').select('name').count()
total_count = csv.count()
print('没有描述信息行数:', str(no_describe_count))
print('没有标题行数:', str(no_name_count))
print('总行数:', str(total_count))
没有8%8信息行数: 96464
没有标题行数: 10997
总行数: 155856
#csv.filter(col("describe") == '').count()
source = pd.DataFrame({
    'name':['描述为空', '标题为空', '总数'],
    'count':[no_describe_count, no_name_count, total_count]
})
alt.Chart(source).mark_bar().encode(
    x='name',
    y='count'
)
 

整体来说视频没有标题的数量较少,但是大多数的视频都没有描述信息。

# !pip3 install altair -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip3 install matplotlib -i https://pypi.tuna.tsinghua.edu.cn/simple
csv.rdd.first()

Row(id='1', create='7/5/2020 05:20:14.134443', update='7/5/2020 05:20:14.768052', name='美女学生考试时被中出', describe=None, source_id='76009', publish_time='1588819417', play_count='2566', good_count='14', bad_count='2', link_count='48', comment_count='0', designation=None, category_id='1', porn_site_id='1', uploader_id='1', producer=None)
pdf = csv.toPandas()
pdf.head()
  id create update name describe source_id publish_time play_count good_count bad_count link_count comment_count designation category_id porn_site_id uploader_id producer
0 1 7/5/2020 05:20:14.134443 7/5/2020 05:20:14.768052 美女学生考试时被中出 None 76009 1588819417 2566 14 2 48 0 None 1 1 1 None
1 2 7/5/2020 05:21:04.500403 7/5/2020 05:21:04.994096 无套中出内射(上) 无套中出内射 76021 1588757668 11199 58 3 338 0 None 1 1 2 None
2 3 7/5/2020 05:21:06.562127 7/5/2020 05:21:06.916907 无套中出内射(下) 无套中出内射 76028 1588757665 7067 39 3 310 0 None 1 1 2 None
3 4 7/5/2020 05:21:12.77195 7/5/2020 05:21:13.217674 极品嫩妹 None 76053 1588756935 3372 18 2 159 0 None 1 1 3 None
4 5 7/5/2020 05:21:18.718269 7/5/2020 05:21:19.039071 骚妹妹 7 None 76051 1588756876 1975 10 0 53 0 None 1 1 3 None
# pdf['playc_count'] = pd.to_numeric(pdf['play_count'], errors='ignore')

# pdf['play_count_int'] = pdf['play_count'].astype(int)
pdf['playc_count'] = pd.to_numeric(pdf['play_count'], errors='coerce')
pdf['playc_count'] = pdf['playc_count'].fillna(0)
pdf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155856 entries, 0 to 155855
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             155855 non-null  object 
 1   create         154126 non-null  object 
 2   update         154316 non-null  object 
 3   name           144859 non-null  object 
 4   describe       59392 non-null   object 
 5   source_id      151995 non-null  object 
 6   publish_time   151994 non-null  object 
 7   play_count     151185 non-null  object 
 8   good_count     150036 non-null  object 
 9   bad_count      151081 non-null  object 
 10  link_count     151110 non-null  object 
 11  comment_count  150634 non-null  object 
 12  designation    36394 non-null   object 
 13  category_id    35416 non-null   object 
 14  porn_site_id   149667 non-null  object 
 15  uploader_id    70964 non-null   object 
 16  producer       3 non-null       object 
 17  playc_count    155856 non-null  float64
dtypes: float64(1), object(17)
memory usage: 21.4+ MB
# 总播放次数,数据中的播放次数并没有进行实时更新,所以数值偏小
# 另外部分网站没有返回播放次数,所以一定程度上也减小了这个实际的总次数,即使如此,播放次数还是比较大的
print('总播放次数:',pdf['playc_count'].sum())
总播放次数: 6277445731.0
print('平均播放次数:',pdf['playc_count'].mean())
平均播放次数: 40277.21570552305
print('最高播放次数:', pdf['playc_count'].max())
print('最低播放次数:', pdf['playc_count'].min()) # 部分网站没有返回播放次数信息,爬虫无法获取播放次数
最高播放次数: 12777854.0
最低播放次数: 0.0

# play_count_df = pd.cut(pdf['playc_count'], 
#                        bins=[0, 100, 200, 300, 400, 500,600,700,800,900, 1000,1100, 12000],
#                       right=False,
#                       labels=['0', '1-99', '100-199', '200-299', '300-399', '400-499', '500-599', '600-699', '700-799', '800-899', '900-999', '>1K']
#                       )
# play_count_df
quartiles = pd.cut(pdf.playc_count, 30)
quartiles.head()

# 定义聚合函数
def get_stats(group):
    return {'playc_count': group.sum()}

# 分组统计
grouped = pdf.playc_count.groupby(quartiles)
price_bucket_amount = grouped.apply(get_stats).unstack()
price_bucket_amount.head()
  playc_count
playc_count  
(-12777.854, 425928.467] 4.329000e+09
(425928.467, 851856.933] 9.426015e+08
(851856.933, 1277785.4] 3.389843e+08
(1277785.4, 1703713.867] 1.600206e+08
(1703713.867, 2129642.333] 1.297862e+08
#from altair import *
#Chart(pdf[:4999]).mark_bar().encode(
#    x=X('play_count:Q'),
#    y='count(*):Q')
name_rdd = csv.select('name').rdd
name_rdd.count()
152841

def splitCH(line):
    is_contain_one_word=False
    words = jieba.cut(str(line))
    l =[]
    if not is_contain_one_word:
        for w in words:
            if len(w)>1:
                l.append(str(w))
        return l
    return list(words)
    

name_rdd.first()
Row(name='美女学生考试时被中出')
# name_word_rdd = name_rdd.flatMap(lambda name: str(name).split(' '))
name_word_rdd = name_rdd.flatMap(splitCH)
name_word_rdd.count()
1006110
name_word_rdd.first()
'Row'
name_word_rdd.take(20)
['Row',
 'name',
 '美女',
 '学生',
 '考试',
 '中出',
 'Row',
 'name',
 '无套',
 '中出',
 '内射',
 'Row',
 'name',
 '无套',
 '中出',
 '内射',
 'Row',
 'name',
 '极品',
 '嫩妹']
# map reduce 进行词频统计
name_counts_rdd = name_word_rdd.map(lambda x:(x,1)).reduceByKey(lambda x,y:x+y)

print('单词统计一共:', str(name_counts_rdd.count()))
单词统计一共: 60585
name_counts_rdd.saveAsTextFile('hdfs://localhost:9000/data2/name_counts.txt')
name_counts_rdd.sortBy(lambda a: a[1],ascending=False).collect()[:20]
[('name', 155856),
 ('Row', 155856),
 ('None', 10997),
 ('中文字幕', 8903),
 ('美女', 6111),
 ('巨乳', 4368),
 ('性感', 4222),
 ('自慰', 4201),
 ('啪啪', 3983),
 ('诱惑', 3279),
 ('人妻', 3239),
 ('极品', 3141),
 ('妹妹', 3131),
 ('女友', 2768),
 ('性爱', 2758),
 ('少妇', 2752),
 ('美少女', 2668),
 ('喜欢', 2654),
 ('高潮', 2624),
 ('酒店', 2392)]

通过上面的统计其实可以看出,鉴于很多网站还有很多来源于日本的视频,所以中文字幕的词频依然非常高。除此之外还可以看到一些常用的描述性词汇。

name_counts_df = name_counts_rdd.toDF().toPandas()

name_counts_df.head()
  _1 _2
0 name 155856
1 美女 6111
2 无套 1323
3 内射 1234
4 极品 3141
# 部分单词词频展示
alt.Chart(name_counts_df[3:40]).mark_bar().encode(
    x='_1',
    y='_2'
)
 


代码参考notebooks/data_anaylize.ipynb
数据参考data目录下相关文件

github: https://github.com/obaby/Porn-Data-Anaylize


分享文章:

猜你喜欢:

发表评论

您的电子邮箱地址不会被公开。 必填项已用*标注