声優出演情報と売上情報を取ってくる。
# Build the list of media-page URLs to download, one per line in list.txt.
# Ids run from 1 to 5030 and are zero-padded to at least four digits
# ("0001" ... "5030"); sprintf() also copes with ids that have more digits.
x <- seq_len(5030)
id <- sprintf("%04d", x)
http <- paste0("http://lain.gr.jp/mediadb/media/", id)
write.table(http, "list.txt", row.names = FALSE, col.names = FALSE,
            quote = FALSE)
# Shell step: fetch every URL listed in list.txt, waiting 0.2 s between
# requests and skipping files that already exist locally.
wget --input-file=list.txt --wait=0.2 --no-clobber
import os
import re
import time
from progressbar import *
# Scan every downloaded page in `wd` and write one tab-separated row per page
# (title, media, space-joined voice-actor names) to cvkyoen.txt.

wd = "/cv/"
files = os.listdir(wd)

# Markers looked for in each line of a page.
title = " - アニメデータベース"  # text that follows the page title on the <title> line
cast = "キャスト"  # not referenced below; kept from the original script
media = "<dt>メディア</dt>"  # the media value is taken from the line after this one
cvre = re.compile('<a href="/voicedb/profile/.*">.*</a>')

# NOTE(review): the original issued at most 1000 readline() calls per file;
# the cap is preserved here -- confirm whether pages can be longer than that.
max_lines = 1000

widgets = ["progress:", Percentage(), Bar()]
maxval = len(files)
pbar = ProgressBar(maxval=maxval, widgets=widgets).start()

# `with` guarantees the output file is flushed and closed even if a page
# fails to parse part-way through the run.
with open("cvkyoen.txt", "w") as w0:
    w0.write("\t".join(["title", "media", "cv"]) + "\n")
    for name in files:
        pbar.update(pbar.currval + 1)

        # Reset per file so a page without a title/media line yields empty
        # fields instead of silently reusing the previous page's values
        # (or raising NameError on the first page).
        res1 = ""
        res2 = ""
        cvlist = []

        # NOTE(review): mode "rU" is kept from the original; Python 3.11+
        # rejects it and needs plain "r" instead.
        with open(wd + name, "rU") as g:
            for _ in range(max_lines):
                tmp = g.readline()
                if not tmp:
                    # End of file: no later readline() can match anything.
                    break
                if title in tmp:
                    # Keep what lies between "<title>" and the site suffix.
                    res1 = tmp.strip().split(title)[0].split("<title>")[-1]
                elif media in tmp:
                    # The value sits on the next line; 4 leading and 5 trailing
                    # characters are cut off (presumably "<dd>" and "</dd>").
                    res2 = g.readline().strip()[4:-5]
                else:
                    hit = cvre.search(tmp)
                    if hit:
                        # Link text of the first profile anchor on the line.
                        cvlist.append(
                            hit.group(0).split("</a>")[0].split(">")[-1]
                        )

        w0.write(res1 + "\t" + res2 + "\t" + " ".join(cvlist) + "\n")
#' Parse ranking HTML fragments into title / figure matrices.
#'
#' Each element of `v` is split on "<br>" into entries and every entry is
#' split on single spaces. Entries with more than one token are kept: the
#' first token, with "*", "," and "-" removed, becomes the figure, and the
#' remaining tokens, concatenated without a separator, become the title.
#' HTML entities "&amp;", "&lt;" and "&gt;" in the title are decoded.
#'
#' @param v Character vector (or list of strings) of HTML fragments.
#' @return A list with one element per element of `v`. Each element is a
#'   two-column character matrix (column 1 = title, column 2 = figure) and has
#'   zero rows when no entry qualifies.
parser <- function(v) {
  res <- vector("list", length(v))
  # seq_along() visits every element (the old `length(v)` loop only touched
  # the last one) and is a no-op for empty input.
  for (i in seq_along(v)) {
    entries <- strsplit(v[[i]], "<br>")[[1]]
    tokens <- strsplit(entries, " ")
    tokens <- tokens[lengths(tokens) > 1]

    # The second split is kept from the original: an empty first token
    # (entry starting with a space) becomes NA rather than "".
    figure <- vapply(
      tokens,
      function(tok) strsplit(tok[1], " ")[[1]][1],
      character(1),
      USE.NAMES = FALSE
    )
    figure <- gsub("[*,-]", "", figure)

    label <- vapply(
      tokens,
      function(tok) paste(tok[-1], collapse = ""),
      character(1),
      USE.NAMES = FALSE
    )
    # Dropping "amp;" first turns "&amp;" into "&", so doubly escaped
    # entities such as "&amp;lt;" are decoded by the next two lines as well.
    label <- gsub("amp;", "", label, fixed = TRUE)
    label <- gsub("&lt;", "<", label, fixed = TRUE)
    label <- gsub("&gt;", ">", label, fixed = TRUE)

    res[[i]] <- matrix(c(label, figure), ncol = 2)
  }
  res
}
# Build sales.txt from the saved ranking pages in `wd`.
# For every page, the three header cells containing `flag` that sort first
# are parsed with parser() and combined into one block of rows, tagged with
# the year and season taken from the file name.
wd <- "/animesale/"
# Escape and anchor the extension so only names ending in ".html" match.
files <- list.files(wd, pattern = "\\.html$")
flag <- "放送開始TVアニメ"
if (length(files) == 0L) {
  stop("no .html files found in ", wd, call. = FALSE)
}

# Collect per-file results and bind once at the end instead of growing
# res.final / years / seasons inside the loop.
blocks <- vector("list", length(files))
year_list <- vector("list", length(files))
season_list <- vector("list", length(files))

for (f in seq_along(files)) {
  # Read from `wd` (list.files() returns bare names, so the old code only
  # worked when the working directory happened to be `wd`).
  hoge <- c(as.matrix(read.delim(file.path(wd, files[f]), header = FALSE)))

  # Cells that contain the flag text; keep those whose rank is below 4.
  hoge1 <- grep(flag, hoge)
  hoge2 <- hoge[hoge1][rank(hoge[hoge1]) < 4]

  # parser() returns a one-element list for a single string.
  res <- lapply(hoge2, function(cell) parser(cell)[[1]])

  # drop = FALSE keeps a one-row table as a 1 x 2 matrix so cbind() does not
  # turn it into a two-row column.
  res1 <- cbind(res[[1]][, 1:2, drop = FALSE], res[[2]][, 2], res[[3]][, 2])

  # Year and season come from the 4th space-separated part of the file name.
  tmp <- strsplit(files[f], " ")[[1]][4]
  year <- strsplit(tmp, "年")[[1]][1]
  season <- strsplit(strsplit(tmp, "年")[[1]][2], "期.html")[[1]]

  blocks[[f]] <- cbind(res1, year, season)
  year_list[[f]] <- year
  season_list[[f]] <- season
}

res.final <- do.call(rbind, blocks)
years <- unlist(year_list)
seasons <- unlist(season_list)

# NOTE(review): column 1 is parser()'s title column and columns 2-4 are the
# three figures, so these labels look shifted by one and column 4 stays
# unnamed. Left unchanged because it alters the sales.txt header -- confirm
# with whoever consumes the file.
colnames(res.final)[1:3] <- c("sum", "sum0", "first")
write.table(res.final, "sales.txt", row.names = FALSE, quote = FALSE,
            sep = "\t")