阿里音乐流行趋势预测大赛与R语言实战_C教程

上一篇: nohup, &作用 | 下一篇: Go基础学习六之并发concurrency

阿里音乐流行趋势预测大赛与R语言实战

发布时间：2019-08-06 发布网站：脚本宝典

脚本宝典收集整理的这篇文章主要介绍了阿里音乐流行趋势预测大赛与R语言实战，脚本宝典觉得挺不错的，现在分享给大家，也给大家做个参考。

概述
本次大赛以阿里音乐用户的历史播放数据为基础，期望参赛队伍通过对阿里音乐平台上每个时间段内艺人的试听量进行预测，挖掘出即将成为潮流的艺人，从而实现对一个时间段内音乐流行趋势的准确把控。大赛将开放一定规模的抽样歌曲艺人数据以及与这些艺人相关的用户行为，参赛队伍需要设计相应的算法进行数据分析和处理，比赛结果按照规定的评价指标使用在线评测程序进行评阅和排名，结果最优者获胜。
第一轮初赛将从5月17日开始到6月14日结束，中间数据会进行切换，届时本文将继续更新相关的比赛实战记录。
比赛地址
探索式分析

      
      
      PE="button" class="copyCode code-tool" data-toggle="tooltip" data-placement="top" data-clipboard-text="# 依赖加载
library(data.table)
library(tidyr)
library(DT)
library(ggplot2)
library(dplyr)

# 数据读取
mars_tianchi_songs = data.table::fread("mars_tianchi_songs.csv");
colnames(mars_tianchi_songs) <- c("song_id","artist_id","publish_time","song_init_plays","language","gender")

mars_tianchi_user_actions = data.table::fread("mars_tianchi_user_actions.csv")
colnames(mars_tianchi_user_actions) <- c("user_id","song_id","gmt_create","action_type","ds")

# 数据聚合
setkey(mars_tianchi_songs,"song_id")
setkey(mars_tianchi_user_actions,"song_id")
total = mars_tianchi_songs[mars_tianchi_user_actions]

# 数据截取
a=total[,.(plays = round(mean(as.numeric(song_init_plays)))),by=list(artist_id,ds)]

# 在探索式分析中，我们定义每个歌手每日的rank值为歌手日歌曲初始热度的均值。
# 对每天每个歌手的初始活跃度求均值
result = a[,.(artist_id,plays,ds),]

# 转化为宽格式
resultDT  = tidyr::spread(result,key=artist_id,value=plays)
resultSubMatrix = resultDT[,-c("ds"),with=F]

# 归一化
weightSubMatrix=cbind(resultDT[,.(ds),],resultSubMatrix/apply(resultSubMatrix,1,sum,na.rm=T))

# 表格可视化
DT::datatable(weightSubMatrix)" title="" data-original-title="复制">
      
      
# 依赖加载
library(data.table)
library(tidyr)
library(DT)
library(ggplot2)
library(dplyr)

# Read the data ----
# Both CSVs are read from the working directory; the column names are assigned
# explicitly here rather than taken from the files.
mars_tianchi_songs <- data.table::fread("mars_tianchi_songs.csv")
data.table::setnames(
  mars_tianchi_songs,
  c(
    "song_id", "artist_id", "publish_time",
    "song_init_plays", "language", "gender"
  )
)

mars_tianchi_user_actions <- data.table::fread("mars_tianchi_user_actions.csv")
data.table::setnames(
  mars_tianchi_user_actions,
  c("user_id", "song_id", "gmt_create", "action_type", "ds")
)

# Join ----
# Key both tables on song_id so that X[Y] joins on it: for every row of the
# user-action table, the matching song columns are looked up.
data.table::setkeyv(mars_tianchi_songs, "song_id")
data.table::setkeyv(mars_tianchi_user_actions, "song_id")
total <- mars_tianchi_songs[mars_tianchi_user_actions]

# Aggregate ----
# For this exploratory analysis, an artist's daily "rank" is defined as the
# rounded mean of song_init_plays over that artist's joined rows for the day.
a <- total[,
  .(plays = round(mean(as.numeric(song_init_plays)))),
  by = .(artist_id, ds)
]

# Keep the three columns in the order expected by the reshape below.
result <- a[, .(artist_id, plays, ds)]

# Reshape to wide format ----
# One row per ds, one column per artist_id. The original call was spelled
# `tidyr::sPRead`, which does not exist; the intended function is spread().
# NOTE(review): the data.table-style subsetting below relies on spread()
# returning a data.table for data.table input -- confirm with the installed
# tidyr version.
resultDT <- tidyr::spread(result, key = artist_id, value = plays)
resultSubMatrix <- resultDT[, -c("ds"), with = FALSE]

# Normalize ----
# Divide each day's row by its total (NAs ignored) so that every artist's
# value becomes a share of that day's summed plays.
dailyTotals <- rowSums(resultSubMatrix, na.rm = TRUE)
weightSubMatrix <- cbind(resultDT[, .(ds)], resultSubMatrix / dailyTotals)

# Show the normalized table ----
DT::datatable(weightSubMatrix)


      
      
      ginal-title="复制">
      
      
# Reshape back to long format ----
# Gather every artist column into (artist_id, plays) pairs while keeping ds as
# an identifier column. Each output row therefore carries its own ds value,
# instead of depending on cbind() recycling a shorter ds column across artists.
weightDT <- tidyr::gather(weightSubMatrix, key = artist_id, value = plays, -ds)

# Plot the normalized popularity ----
# Columns are referenced by bare name inside aes(): using weightDT$... there
# bypasses ggplot2's data masking and produces axis/legend titles such as
# "weightDT$ds".
ggplot(data = weightDT, aes(x = ds, y = plays, color = artist_id)) +
  geom_line()


本文将持续更新