（サッカー解説）2点差は危険なスコアですね ← ???

MikuHatsune2018-01-17

高校サッカーを見ていた。2017年度は前橋育英が初優勝で幕を閉じた。
どの試合だったか忘れてしまったが、2点差がついたときに解説が「2点差は危険」ということを言っていた。
　
調べてみると、やはりよく言われていることのようだが、実際にデータをとってみると、プレミアリーグでは2点差をひっくり返して勝つ確率は1.71%、Jリーグでは2点差からドローが5%、2点差から敗北5%だったらしい。
　
自分はユース年代のファンなので、せっかくJFA が公式に試合記録を出してくれるということもあって、冬の高校選手権の得点時間を抽出して、2点差が危険なのかどうかを解析したい。
　
JFA から公式記録PDF を取得するが、2009年(88回大会)から2017年(96回大会)まで存在していて、各大会47試合ある。ただし、2009年はPDF の都合でデータをパースできなかったので全部で379試合が対象である。
試合記録から、得点時間と得点チームを抽出する。というのも、得点時間の分布と、ある時刻でのリードが最終的に勝率にどのくらい影響するかをみたいからである。
ただし、冬の選手権は決勝までは80分+即PK戦、決勝は90分+15分ハーフの延長戦なので、いい感じに分類したかったが、90分のときのロスタイムでうまく処理できてない気もするが無視。
また、2点差が危険の定義だが、N点差がついていたにもかかわらず、追いつかれて点差が0 になったときとする。というのも、追いつかれたとしてもトーナメント方式なのでPK戦までして勝敗を決めるが、得点時間を取得した都合で、80分内での点差の推移を考えることにする。なので、80分時点での点差が勝ち、引き分け、負けになる。
90分の試合を補正して80分にしてもよかったが、していない。ロスタイムの得点で最大80+6分というのがある。
　
結論から言うと、2点差がつくとたいてい90% 以上の確率でリードを保ったまま試合を終えることができる。
ある試合時刻 $t$ のとき、2点差がついている試合数 $N_{t,2}$ があり、残りの $80-t$ 時間、リードを保ったまま(点差が0 より大)試合を終えた試合数 $N_{t,2}^{'}$ を単純に数えて割ったものが各点であり、指数関数モデルで曲線フィッティングしたのが黒線である。
試合経過時間が短いときに得点しても、追いつくチャンスがまだまだあるので、リードを保ち続けられる確率は低めだが、時間が経てばそのチャンスが減っていくので、リードを保てる確率は増える。
1点差のリードのとき、80-85分のあたりで確率が低くなっており、このせいで推定曲線が全体的に下がったものが推定されているが、ここを除外して推定曲線を引くと、80分時点で1点リードしていればほぼ勝率が100% になる。

　
前半、後半について得点時間の分布を見てみると、特にどの時間帯でピークがある、という感じではなく、一様分布のようである。なのでどの時間帯にトイレにいっても得点シーンを見逃す確率は同じ程度と思われる。

　
一方で、得点が入ってから次の得点がはいるまでの時間の分布は、一様分布ではない。横軸が経過時間、縦軸が確率密度とすれば、ガンマ分布のようである。また、1試合あたりの得点数も、ポアソン分布っぽい。
というわけで、次の得点までの時間分布をガンマ分布で、1試合あたりの両チーム合計の得点数をポアソン分布でモデル化してみると、こんな感じになる。
ガンマ分布のパラメータは(1.33, 0.06) で、最頻5分、中央16分程度、平均20分で点がはいる。
ポアソン分布のパラメータは2.9であり、1試合あたり3点くらいの点がはいる。

　
というわけで、「2点差は危険」というのは、1回のゴールで1点しか点が動かないサッカーのルールの特性上、2点差から追いつくもしくは逆転されるという試合が伝説のように語られる思い出バイアスなので、解説の人は勉強してください()
　
解析

### analysis
library(stringr)
dat <- read.table("soccer.txt", header=TRUE, stringsAsFactors=FALSE)

tmp <- split(dat[,-1], dat$year)
tmp <- mapply(function(z) split(z[,-1], z$no), tmp)

res <- vector("list", length(tmp))
k <- c(1, -1)
for(i in seq(tmp)){
  out <- vector("list", length(tmp[[i]]))
  for(j in seq(tmp[[i]])){
    hoge <- tmp[[i]][[j]]
    foo <- lapply(str_extract_all(hoge$time, "\\d+"), as.numeric)
    time <- mapply(function(z) append(z, rep(0, 2-length(z))), foo)
    team <- k[c(factor(hoge$team, levels=unique(hoge$team)))]
    bar <- rbind(time, team)
    rownames(bar) <- c("time", "extra", "team")
    out[[j]] <- bar
  }
  res[[i]] <- out
}


y0 <- lapply(res, do.call, what=cbind)
y1 <- do.call(cbind, y0)
half <- c("前半", "後半")
idx <- list(y1[1,] <= 40, y1[1,] >  40)
goaltime <- mapply(function(z) colSums(y1[1:2, z]), idx)
yl <- c(0, 4)
par(mfcol=c(2, 2), las=1)
for(i in seq(goaltime)){
  pl1 <- table(goaltime[[i]])/sum(unlist(idx), na.rm=TRUE)
  pl2 <- table(goaltime[[i]])/length(goaltime[[i]])
  barplot(pl1*100, xlab="経過時間", ylab="percentage", ylim=yl, col=i+1, main=sprintf("%sの得点時間分布 (%s時間)", half[i], "全試合"))
  barplot(pl2*100, xlab="経過時間", ylab="percentage", ylim=yl, col=i+1, main=sprintf("%sの得点時間分布 (%s時間)", half[i], half[i]))
}


score <- vector("list", sum(sapply(tmp, length)))
k <- 1
for(i in seq(tmp)){
  for(j in seq(tmp[[i]])){
    hoge <- res[[i]][[j]]
    score[[k]] <- cbind(c(0, colSums(hoge[1:2,,drop=FALSE])), cumsum(c(0, hoge[3,,drop=FALSE])))
    k <- k + 1
  }
}

mat <- NULL
for(i in seq(res)){
  for(j in seq(res[[i]])){
    k <- res[[i]][[j]]
    if(!is.na(k[1])){
      l <- append(rep(NA, max(k[1,])), rep(NA, tail(k[2,], 1)+1))
      elp <- c(1, colSums(k[1:2,,drop=FALSE])+1) # 得点時間のindex
      elp <- append(elp, ifelse(max(k[1,])<=80, max(80, elp), ifelse(max(k[1,])<=90, max(90, elp), 120)))
      s <- c(0, cumsum(k[3,]))
      for(m in 1:(length(elp)-1)){
        l[ elp[m]:elp[m+1] ] <- s[m]
      }
    } else {
      l[1:80] <- 0
      l <- rep(0, 80)
    }
    l <- append(l, rep(NA, 120-length(l)))
    mat <- rbind(mat, l)
  }
}
library(rstan)
# 同時に
code <- "
data{
  int Ngame;
  int M;
  int Ngoal[Ngame];
  vector<lower=-1>[M] G[Ngame];
}
parameters{
  real<lower=0, upper=10> lambda;
  real<lower=0, upper=5> p[2];
}
model{
  Ngoal ~ poisson(lambda);
  for(i in 1:Ngame){
    G[i, 1:Ngoal[i] ] ~ gamma(p[1], p[2]);
  }
}
"

m <- stan_model(model_code=code)
Ng <- sapply(goaltime, length)
game <- t(mapply(function(z) append(z, rep(-1, max(Ng)-length(z))), goaltime))
dat <- list(Ngame=length(Ng), Ngoal=Ng, M=max(Ng), G=game)
fit <- sampling(m, data=dat, iter=3000, warmup=1500, seed=1234)
ex <- extract(fit)


par(mfrow=c(1, 2), las=1, mar=c(5, 4.5, 2, 2), cex.lab=1.5)
goaltime <- mapply(function(z) head(rle(as.numeric(na.omit(mat[z,])))$length, -1), seq(nrow(mat)))
g <- unlist(goaltime)
x <- 1:100
tab <- table(factor(g, x))

dg <- dgamma(x, median(ex$p[,1]), median(ex$p[,2]))
plot(tab/length(g)*100, xlab="直前のゴールからの経過時間（分）", ylab="頻度 [%]")
lines(x, dg*100, col=2, lwd=3)

x <- 0:max(Ng)
Ngd <- table(factor(Ng, x))
dp <- dpois(x, median(ex$lambda))
plot(Ngd/length(Ng)*100, xlab="1試合の総得点数", ylab="頻度 [%]")
lines(x, dp*100, col=2, lwd=3)


# ある時刻での勝利確率を求める
mat80 <- mat[is.na(mat[,90]),]
s <- replicate(2, matrix(0, 2, 90), simplify=FALSE)
for(k in 1:2){
  for(i in 1:nrow(mat80)){
    a <- abs(mat80[i,])
    M <- sum(!is.na(a))
    for(j in 1:M){
      if(a[j] >= k){
        s[[k]][2, j] <- s[[k]][2, j] + 1
        if(all(a[j:M] > 0)){
          s[[k]][1, j] <- s[[k]][1, j] + 1
        }
      }
    }
  }
}

xl <- c(0, 90)
yl <- c(0.5, 1)
cols <- c("blue", "red")
plot(0, type="n", xlim=xl, ylim=yl, xlab="試合経過時間（分）", ylab="試合終了時までリードし続ける確率", cex.lab=1.2, cex.axis=1.2, las=1)
for(i in seq(s)){
  v <- s[[i]][1,]/s[[i]][2,]
  points(v, col=cols[i], pch=16)
  d <- drm(v ~ seq(v), fct=AR.3())
  lines(d$data[,1], d$predres[,1], lwd=3)
}
legend("bottomright", legend=sprintf("%d 点差", 1:2), col=cols, pch=16, cex=2)

　
データ取得と前処理

# 2017-2015 年までの96-93 大会
wd <- "/soccer/"
dir.create(wd)
y <- 2017:2014
m <- 1:47

for(i in y){
  urls <- sprintf("http://www.jfa.jp/match/alljapan_highschool_%d/match_report/m%d.pdf", i, m)
  dir.create(sprintf("%s%d", wd, i))
  for(j in urls){
    command <- sprintf("wget %s -P %s%d", j, wd, i)
    system(command)
  }
}

for(i in 2013){
  urls <- sprintf("https://www.jfa.or.jp/match/matches/2014/0113koukou_soccer/match_report/m%d.pdf", m)
  dir.create(sprintf("%s%d", wd, i))
  for(j in urls){
    command <- sprintf("wget %s -P %s%d", j, wd, i)
    system(command)
  }
}

y <- 2012:2008
d1 <- c("0114", "0110", "0110", "0110", "")
for(i in seq(y)){
  urls <- sprintf("http://www.jfa.or.jp/match/matches/%d/%skoukou_soccer/schedule_result/pdf/m%02d.pdf", y[i]+1, d1[i], m)
  dir.create(sprintf("%s%d", wd, y[i]))
  for(j in urls){
    command <- sprintf("wget %s -P %s%d", j, wd, y[i])
    system(command)
  }
}
system(sprintf("mv %s%d/* %s%d/; rm -r %s%d", wd, 2008, wd, 2009, wd, 2008))

###
library(pdftools)
library(stringr)
year <- list.files(wd)
year <- as.character(2017:2013)
# 2013-2017
output <- NULL
for(j in seq(year)){
  print(sprintf("%s", year[j]))
  files <- list.files(sprintf("%s%s", wd, year[j]), pattern="pdf", recursive=TRUE, full.names=TRUE)
  for(f in seq(files)){
  text <- pdf_text(files[f])
  text <- strsplit(text, "\n")[[1]]
  i <- 1
  while(i < length(text)){
    if(any(strsplit(text[i], " ")[[1]] == "得点時間")){
      i <- i + 1
      res <- NULL
      while(length(grep("PK戦の経過", text[i])) < 1){
        tmp <- strsplit(gsub(" +", " ", text[i]), " ")[[1]][2:3] if(length(grep("分", tmp)) > 0){
          res <- rbind(res, tmp)
        }
        i <- i + 1
      }
      tmp <- table(res[,2])
      if(length(tmp) > 1){
        if(tmp[1] == tmp[2]){
          i <- i + 1
          a1 <- strsplit(gsub(" +", " ", text[i]), " ")[[1]]
          a1 <- a1[a1 != " "]
          i <- i + 1
          a2 <- strsplit(gsub(" +", " ", text[i]), " ")[[1]]
          a2 <- a2[a2 != " "]
          a3 <- c(a1[1], a2[1])[which.max(c(length(grep("○", a1)), length(grep("○", a2))))]
          res <- cbind(res, (res[,2] == a3)+0)
        } else {
          i <- i + 100
          a3 <- names(tmp)[which.max(tmp)]
          res <- cbind(res, (res[,2] == a3)+0)
        }
      } else {
        res <- cbind(res, 1)
      }
    }
    i <- i + 1
  }
  res <- switch(1 + (length(res)==1), res, matrix(c(NA, NA, res), 1))
  res <- cbind(year[j], f, res)
  output <- rbind(output, res)
  }
}

year <- as.character(2012:2009)
for(j in seq(year)){
  files <- list.files(sprintf("%s%s", wd, year[j]), pattern="pdf", recursive=TRUE, full.names=TRUE)
  for(f in seq(files)){
  text <- pdf_text(files[f])
  text <- strsplit(text, "\n")[[1]]
  if(length(text) > 0){
    i <- 1
    while(i < length(text)){
      pktext <- str_extract(text[i], "先.*先")
      if(!is.na(pktext)){
        pk <- as.numeric(str_extract_all(pktext, "\\d+")[[1]])
      }
      if(any(strsplit(text[i], " ")[[1]] == "得点チーム")){
        i <- i + 1
        res <- NULL
        while(length(grep("備考欄|戦評者氏名", text[i])) < 1){
          tmp <- strsplit(gsub(" +", " ", text[i]), " ")[[1]]
          tmp <- tmp[grep("分", tmp)]
          if(length(grep("分", tmp)) > 0){
            tmp <- strsplit(gsub("分", "分 ", tmp), " ")[[1]]
            res <- rbind(res, tmp)
          }
          i <- i + 1
        }
        tmp <- table(res[,2])
        if(length(tmp) > 1){
          if(tmp[1] == tmp[2]){
            i <- i + 1
            a1 <- strsplit(gsub(" +", " ", text[i]), " ")[[1]]
            a1 <- a1[a1 != " "]
            i <- i + 1
            a2 <- strsplit(gsub(" +", " ", text[i]), " ")[[1]]
            a2 <- a2[a2 != " "]
            a3 <- c(a1[1], a2[1])[which.max(c(length(grep("○", a1)), length(grep("○", a2))))]
            res <- cbind(res, (res[,2] == a3)+0)
          } else {
            i <- i + 100
            a3 <- names(tmp)[which.max(tmp)]
            res <- cbind(res, (res[,2] == a3)+0)
          }
        } else {
          res <- cbind(res, 1)
        }
      }
      i <- i + 1
    }
    res <- switch(1 + (length(res)==1), res, matrix(c(NA, NA, res), 1))
    res <- cbind(year[j], f, res)
    output <- rbind(output, res)
    } else {
      print(files[f])
    }
  }
}