Twitterの投稿時間分布から考える声優のあれこれ - 驚異のアニヲタ社会復帰の予備

MikuHatsune2014-10-26

これとこれ(取り下げになってるけど)に触発されて次の声優統計ネタにしようと思ってやってみる。
Twitterの投稿時間分布が似ていれば、生活習慣が似ているだろうし、アフレコとかラジオとか仕事がちょっとはかぶっているだろうし、男と女の関係を邪推したりといろいろ捗る!!
というわけで、投稿時間の分布の比較にはKullback–Leibler divergenceを使おう。KLDはentropyパッケージで計算できる。
定義としては離散的な確率分布 $P$ , $Q$ に対して、
$D_{KL}(P||Q)=\sum_{i}P(i)log\frac{P(i)}{Q(i)}$
となる。 $P$ と $Q$ が等しければ0となる。また、 $D_{KL}(P||Q)>0$ である。
$P(i)=0$ のとき、 $xlogx$ がうまく計算されないので、便宜上、 $P(i)=0$ のとき適当にものすごく小さな値を入れている。
　
Rでの読み込み時にtweet本文の文字列があるとめんどうなことが起きることが多いので、投稿時間だけにしておく。
データのとり方はこれとこれとこれを参考にしている。

# 投稿時間だけ抜き出す
import os
import re
wd = "/tweet/"
files = os.listdir(wd)

w0 = open("tweet_time.tsv", "w")
for i in range(len(files)):
	g = open(wd + files[i], "rU")
	cv = re.sub(".tsv", "", files[i])
	for l in g:
		s = l.split("\t")
		if len(s) >= 4:
			t = l.split("\t")[2]
			w0.write("\t".join([cv, t]) + "\n")

w0.close()

というわけでRでひたすら処理をしていこう。

# これをしておかないと時間処理がうまくいかない。
lct <- Sys.getlocale("LC_TIME"); Sys.setlocale("LC_TIME", "C")
x <- c("1jan1960", "2jan1960", "31mar1960", "30jul1960")
z <- as.Date(x, "%d%b%Y")

# Rで投稿時間分布をKLで
library(entropy)
wd <- "/tweet/"
tw <- list.files(wd)
cv <- read.csv("twitter_cv_list.txt", header=FALSE)
dat <- read.delim("tweet_time.tsv", header=FALSE)
dat2 <-  strsplit(as.matrix(dat$V2), " ")
# クッソ時間かかるよ
z <- mapply(function(x) strptime(paste(x[c(3, 2, 6, 4)], collapse=""), "%d%b%Y%H:%M:%S", "JST")+ 9*60*60, dat2, SIMPLIFY=FALSE)

# 投稿時間への変換
tweet_dist <- vector("list", nrow(cv))
names(tweet_dist) <- cv$V2
mat <- diag(0, nrow(cv))
rownames(mat) <- colnames(mat) <- cv$V2

pb <- txtProgressBar(max=nrow(cv), style=3)
for(i in seq(nrow(cv))){
	setTxtProgressBar(pb, i)
	idx <- which(dat$V1 == as.character(cv$V1[i]))
	h <- as.integer(mapply(function(x) format(x, "%H"), z[idx]))
	tweet_dist[[i]] <- h
}

# 全声優についてとりあえずプロットしてみる。
# 例として17歳声優についてプロットしてみる。
td <- mapply(function(x) table(factor(x, 0:23)), tweet_dist)
td <- sweep(td, 2, colSums(td), "/")
idx <- which(colnames(td) == "田村ゆかり")

matplot(0:23, td, type="l", col=grey(0.8), lty=1, xaxt="n", xlab="投稿時間", ylab="分布割合")
axis(1, at=0:23, labels=0:23)
lines(0:23, td[, idx], col="red", lwd=3)

たいていは夜にtweetしてそうだけど、17歳の声優は深夜も夜明け前も関係なくつぶやいているっぽい。
朝8時にはたぶんおはようが多いんだろう。

　
300人以上の声優について、KLDをひたすら計算して、分布が似ていそうな人をクラスタリングしてみよう。

pb <- txtProgressBar(max=(nrow(cv)^2-nrow(cv))/2, style=3)
counter <- 1
# KLDの計算
for(i in 1 : (nrow(mat) - 1)){
	t_i <- table(factor(tweet_dist[[i]], 0:23))
	t_i <- t_i / sum(t_i)
	t_i <- ifelse(t_i != 0, t_i, 10e-10)
	for(j in (i + 1) : ncol(mat)){
		t_j <- table(factor(tweet_dist[[j]], 0:23))
		t_j <- t_j / sum(t_j)
		t_j <- ifelse(t_j != 0, t_j, 10e-10)
		kld <- try(KL.empirical(t_i, t_j), silent=TRUE)
		mat[i, j] <- ifelse(class(kld) == "try-error", 0, kld)
		setTxtProgressBar(pb, counter)
		counter <- counter + 1
	}
}

na <- na.omit(mat)
mat1 <- na[, -attributes(na)$na.action]
mat1 <- mat1 + t(mat1)

library(gplots)
library(ape)
# heatmap はうまく行かなかったので無視。
h <- heatmap.2(mat1, scale="col", trace="none", dendrogram="row", key=FALSE, cexCol=10e-10, lhei=c(0.1, 2), lwid=c(0.3, 1), margin=c(1, 5),
hclustfun=function(b){hclust(b, method="ward")})

hcl <- hclust(as.dist(mat1), method="ward")
ph <- as.phylo(hcl) # phylo オブジェクトの方が使いやすい?

idx <- which(ph$tip.label == "田村ゆかり")
cex <- replace(rep(0.1, length(ph$tip.label)), idx, 1.2)
col <- replace(rep("white", length(ph$tip.label)), idx, "red")
par(mar=c(0, 0, 0, 0))
plot(ph, cex=cex, tip.color=col)
plot(ph, cex=cex, tip.color=col, type="unrooted")

とりあえず9つくらいのクラスタができそうな気配がある。あとはkmeansとか、デンドログラムの高さでcutしていろいろ見たい。