#########################################
# #
# Code for A. sediba-Homo paper #
# #
#########################################
# Author: Andrew Du
# Date: 10-5-18 (revised 12-26-18)
## Figure 3: calculating the probability of obtaining an age difference between ancestor & descendant at least as great as that seen between A. sediba & earliest Homo, assuming some hominin temporal duration
# Probability model
# ARGUMENTS (using notation from Equation 5c)
# T_o: vector of overlap values (in absolute number of years)
# T_d: observed age difference between A. sediba and Homo fossil horizons
# T_R: assumed hominin temporal duration
prob.model.pval <- function(T_o, T_d, T_R){
p.value <- (T_o - T_d) ^ 2 / (2 * T_R ^ 2) # Equation 5c from paper
p.value[T_d > T_o] <- 0 # set all values where the observed age difference is greater than overlap to zero
names(p.value) <- T_o
return(p.value)
}
# Establish arguments for p-value function
obs.diff.LG <- (2.8 + 2.75) / 2 - 1.977 # observed age difference between A.sediba & Ledi-Geraru mandible (using the midpoint for the latter)
hom.dur_Robinson <- 0.97 # mean hominin temporal duration from Robinson et al. 2018
prop.overlap <- seq(0, 1, 0.01) # proportion overlap between ancestral & descendant ranges
# Run analyses
LG_Rob_pvals <- prob.model.pval(T_o = hom.dur_Robinson * prop.overlap, T_d = obs.diff.LG, T_R = hom.dur_Robinson)
# Plot results
par(oma = c(0, 0.2, 1, 0))
plot(prop.overlap * hom.dur_Robinson, LG_Rob_pvals, type = "l", lwd = 7, xlab = "Absolute range overlap (millions of years)", ylab = expression(italic(P) * "-value"), cex.axis = 1.5, cex.lab = 1.5)
axis(3, at = seq(0, 1, 0.2) * hom.dur_Robinson, labels = seq(0, 1, 0.2), cex.axis = 1.5)
mtext("Relative range overlap (proportion of total range)", side = 3, cex = 1.5, line = 3)
## Expected P-value if all temporal overlap values are treated as equally likely
# ARGUMENTS (using notation from Equation 6c)
# T_d: observed age difference between A. sediba and Homo fossil horizons
# T_R: assumed hominin temporal duration
expect.pval <- function(T_d, T_R){
p <- (T_R - T_d) ^ 3 / (6 * T_R ^ 3) # Equation 6c from paper
return(p)
}
expect.pval(T_d = obs.diff.LG, T_R = hom.dur_Robinson) # 0.0009
## Figure 4: comparing age difference between A. sediba & earliest Homo to differences between ages of first-discovered fossils in hypothesized ancestor-descendant species pairs
# Read in data
ages <- read.csv("aav9038_Data_file_S2.csv", header = TRUE)
anc.desc <- read.csv("aav9038_Data_file_S1.csv", header = TRUE)
# calculate age differences between ancestor and descendants using midpoint ages
obs.anc.desc.diff <- numeric(length = nrow(anc.desc)) # empty vector to save results to
for(i in seq_along(obs.anc.desc.diff)){ # iterate through recorded ancestor-descendant pairs
anc <- as.character(anc.desc$ancestor[i]) # get out ancestor species
desc <- as.character(anc.desc$descendant[i]) # get out descendant species
# calculate age midpoints
anc.midpoint <- (ages$lower.age[ages$species == anc] + ages$upper.age[ages$species == anc]) / 2
desc.midpoint <- (ages$lower.age[ages$species == desc] + ages$upper.age[ages$species == desc]) / 2
obs.anc.desc.diff[i] <- desc.midpoint - anc.midpoint # calculate difference. Negative values represent ancestor horizons that are older
}
pnorm(obs.diff.LG, mean = mean(obs.anc.desc.diff), sd = sd(obs.anc.desc.diff)) * 100 # observed age difference between A. sediba and earliest Homo is in the >99.9 percentile
# Plot results
par(mar = c(5 + 1, 4, 4, 2) + 0.1)
h <- hist(obs.anc.desc.diff, col = "gray", breaks = 10, xlim = c(-2, 1), ylim = c(0, 10), ylab = "Number of ancestor-descendant species pairs", xlab = "", main = "", cex.lab = 1.5, cex.axis = 1.5)
mtext("Age difference between first-discovered fossils\nin ancestor and descendant (millions of years)", side = 1, line = 4.5, cex = 1.5)
# fit normal density curve
x <- seq(-2, 1, length.out = 1000)
y <- dnorm(x = x, mean = mean(obs.anc.desc.diff), sd = sd(obs.anc.desc.diff))
lines(x, y * (max(h$counts) + 1), lwd = 2)
arrows(x0 = obs.diff.LG, y0 = 2, y1 = 0, lwd = 3)
abline(v = 0, lty = 2, lwd = 2)
text(-1, 9.25, "Ancestor fossil older", cex = 1.25)
text(0.6, 8.5, "Ancestor fossil\nyounger", pos = 3, cex = 1.25)
# Figure S1: simulations to confirm our probability model
# This function randomly samples a fossil horizon each from an ancestor and descendant temporal range (according to the assumptions in the arguments) and calculates the proportion of age differences that are at least as great as that observed between A. sediba and Homo (i.e., a P-value)
# ARGUMENTS
# T_o: vector of overlap values (in absolute number of years)
# T_d: observed age difference between A. sediba and Homo fossil horizons
# T_R: assumed hominin temporal duration
# n.iter: number of iterations
simul.pval <- function(T_o, T_d, T_R, n.iter){
anc.orig <- 0 # ancestor's origination age
anc.extinct <- T_R # ancestor's extinction age
p.value <- numeric(length = length(T_o)) # empty vector to save p-value results to
names(p.value) <- T_o
for(i in seq_along(T_o)){
anc.foss.hor <- runif(n.iter, min = anc.orig, max = anc.extinct) # randomly select one fossil horizon from ancestral range n.iter times
desc.orig <- anc.extinct - T_o[i] # descendant's origination age
desc.extinct <- desc.orig + T_R # descendant's extinction age
desc.foss.hor <- runif(n.iter, min = desc.orig, max = desc.extinct) # randomly select one fossil horizon from descendant range n.iter times
anc.desc.diff <- anc.foss.hor - desc.foss.hor
p.value[i] <- sum(anc.desc.diff >= T_d) / n.iter
print(i)
}
return(p.value)
}
n.iter <- 100000 # 100,000 iterations
# Double-check main probability model (Eq. 5c)
sim_LG_Rob_pvals <- simul.pval(T_o = hom.dur_Robinson * prop.overlap, T_d = obs.diff.LG, T_R = hom.dur_Robinson, n.iter = n.iter)
# Plot results
plot(LG_Rob_pvals, sim_LG_Rob_pvals, xlab = expression("Probability model" ~ italic(P) * "-values"), ylab = expression("Simulation model " ~ italic(P) * "-values"))
abline(0, 1)
# Double-check expected value probability model (Eq. 6c). Run simulations over 1,000 values of overlap and take the average of the 1,000 P-values
sim.expect.pval <- simul.pval(T_o = hom.dur_Robinson * seq(0, 1, length.out = 1000), T_d = obs.diff.LG, T_R = hom.dur_Robinson, n.iter = n.iter)
mean(sim.expect.pval) # 0.0009, which matches the value from Eq. 6c
# Figure S2: sensitivity analyses looking at different hominin temporal durations and using AL 666-1 as the first appearance of Homo.
obs.diff.AL666 <- 2.33 - 1.977 # observed age difference between A.sediba & AL 666-1 maxilla
hom.dur_2Myr <- 2 # use 2 Myr for hominin duration
LG_2Myr_pvals <- prob.model.pval(T_o = hom.dur_2Myr * prop.overlap, T_d = obs.diff.LG, T_R = hom.dur_2Myr)
expect.pval(T_d = obs.diff.LG, T_R = hom.dur_2Myr) # 0.036
sim_LG_2Myr_pvals <- simul.pval(T_o = hom.dur_2Myr * seq(0, 1, length.out = 1000), T_d = obs.diff.LG, T_R = hom.dur_2Myr, n.iter = n.iter) # simulate 1,000 p-values
mean(sim_LG_2Myr_pvals) # 0.036, which matches the calculated value
AL666_Rob_pvals <- prob.model.pval(T_o = hom.dur_Robinson * prop.overlap, T_d = obs.diff.AL666, T_R = hom.dur_Robinson)
expect.pval(T_d = obs.diff.AL666, T_R = hom.dur_Robinson) # 0.043
sim_AL666_Rob_pvals <- simul.pval(T_o = hom.dur_Robinson * seq(0, 1, length.out = 1000), T_d = obs.diff.AL666, T_R = hom.dur_Robinson, n.iter = n.iter) # simulate 1,000 p-values
mean(sim_AL666_Rob_pvals) # 0.043, which matches the calculated value
AL666_2Myr_pvals <- prob.model.pval(T_o = hom.dur_2Myr * prop.overlap, T_d = obs.diff.AL666, T_R = hom.dur_2Myr)
expect.pval(T_d = obs.diff.AL666, T_R = hom.dur_2Myr) # 0.093
sim_AL666_2Myr_pvals <- simul.pval(T_o = hom.dur_2Myr * seq(0, 1, length.out = 1000), T_d = obs.diff.AL666, T_R = hom.dur_2Myr, n.iter = n.iter) # simulate 1,000 p-values
mean(sim_AL666_2Myr_pvals) # 0.093, which matches the calculated value
# Plot results
par(mfrow = c(1, 3), mar = c(5, 0, 4, 0) + 0.1, oma = c(0, 5.5, 2, 0))
plot(prop.overlap * hom.dur_2Myr, LG_2Myr_pvals, type = "l", lwd = 7, xlab = "Absolute range overlap (millions of years)", ylab = "", cex.axis = 2, cex.lab = 2, ylim = c(0, 0.35))
axis(3, at = seq(0, 1, 0.2) * hom.dur_2Myr, labels = seq(0, 1, 0.2), cex.axis = 2)
mtext("Relative range overlap (proportion of total range)", side = 3, cex = 1.4, line = 3)
text(0, 0.35, "A", cex = 4, font = 2)
mtext(expression(italic(P) * "-value"), side = 2, line = 3.5, cex = 1.4)
plot(prop.overlap * hom.dur_Robinson, AL666_Rob_pvals, type = "l", lwd = 7, xlab = "Absolute range overlap (millions of years)", ylab = "", cex.axis = 2, cex.lab = 2, ylim = c(0, 0.35), yaxt = "n")
axis(3, at = seq(0, 1, 0.2) * hom.dur_Robinson, labels = seq(0, 1, 0.2), cex.axis = 2)
mtext("Relative range overlap (proportion of total range)", side = 3, cex = 1.4, line = 3)
text(0, 0.35, "B", cex = 4, font = 2)
plot(prop.overlap * hom.dur_2Myr, AL666_2Myr_pvals, type = "l", lwd = 7, xlab = "Absolute range overlap (millions of years)", ylab = "", cex.axis = 2, cex.lab = 2, ylim = c(0, 0.35), yaxt = "n")
axis(3, at = seq(0, 1, 0.2) * hom.dur_2Myr, labels = seq(0, 1, 0.2), cex.axis = 2)
mtext("Relative range overlap (proportion of total range)", side = 3, cex = 1.4, line = 3)
text(0, 0.35, "C", cex = 4, font = 2)
## Figure S3: looking at distribution of fossil recovery potential through time in South Africa & eastern Africa
# read in data
hom.mbr <- read.csv("aav9038_Data_file_S3.csv", header = TRUE)
# see if any member ages are duplicated. If so, remove them
SA <- hom.mbr[hom.mbr$region == "South Africa", ]
sa.mid <- (SA$lower.age + SA$upper.age) / 2
any(duplicated(sa.mid)) # no duplicated South African ages
EA <- hom.mbr[hom.mbr$region == "eastern Africa", ]
ea.mid <- (EA$lower.age + EA$upper.age) / 2
any(duplicated(ea.mid)) # there are duplicated eastern African ages. Remove them.
EA <- EA[!duplicated(ea.mid), ]
# Uniform QQ plot (code from Du et al., resubmitted to JHE)
qq.plot <- function(lower.age, upper.age, n.iter = 100000, PLOT = TRUE, lm.lty = 1, lm.lwd = 2, ci.lty = 2, ci.lwd = 2, ...){
mid <- (lower.age + upper.age) / 2
n <- length(mid)
rand.unif <- matrix(runif(n.iter * n, 0, 1), nrow = n.iter, ncol = n) # random draw of n data points (observed number of fossil horizons) from standard uniform distribution n.iter times
rand.unif <- t(apply(rand.unif, 1, sort)) # sort each vector
quant.unif <- qunif(ppoints(n, 0)) # expected quantiles from a standard uniform distribution
scale.loc.lm <- lm(sort(mid, decreasing = TRUE) ~ quant.unif) # for scaling the simulated uniform values to plot as confidence bounds. Can do this because the uniform distribution is a location-scale family
ci <- apply(rand.unif, 2, function(x) quantile(x, c(0.025, 0.975))) # calculate 95% confidence bounds
ci <- ci * scale.loc.lm$coefficients[2] + scale.loc.lm$coefficients[1] # scale the 95% confidence bounds
if(PLOT){
plot(quant.unif, sort(mid, decreasing = TRUE), ylim = rev(range(c(lower.age, upper.age))), ...) # plot observed midpoint ages vs. uniform quantiles
segments(x0 = quant.unif, y0 = lower.age[order(mid, decreasing = TRUE)], y1 = upper.age[order(mid, decreasing = TRUE)], lwd = 1.5) # error bars of observed bracketing ages
abline(scale.loc.lm, lwd = lm.lwd, lty = lm.lty) # line of best fit
lines(quant.unif, ci[1, ], lwd = ci.lwd, lty = ci.lty) # lower confidence bound
lines(quant.unif, ci[2, ], lwd = ci.lwd, lty = ci.lty) # upper confidence bound
} else return(list(quant.unif = quant.unif, ci = ci, scale.loc.lm = scale.loc.lm))
}
# create the figure
par(mfrow = c(1, 2), mar = c(5, 4, 4, 0.1) + 0.1)
# South African plot
qq.plot(lower.age = SA$lower.age, upper.age = SA$upper.age, main = "", ci.lty = 1, pch = 16, cex = 1.25, xlab = "Uniform quantiles", ylab = "Observed ages (millions of years ago)", cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5)
abline(h = 1.977 + 0.5, lty = 2, lwd = 2)
abline(h = 1.977 - 0.5, lty = 2, lwd = 2)
text(0.66, 3.65, "South Africa", cex = 2, pos = 4)
text(0.08, 0.7, expression(bold("A")), cex = 3)
# eastern African plot
qq.plot(lower.age = EA$lower.age, upper.age = EA$upper.age, main = "", ci.lty = 1, pch = 16, cex = 1.25, xlab = "Uniform quantiles", ylab = "", cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5)
abline(h = (2.8 + 2.75) / 2 + 0.5, lty = 2, lwd = 2)
abline(h = (2.8 + 2.75) / 2 - 0.5, lty = 2, lwd = 2)
text(0.635, 4.35, "eastern Africa", cex = 2, pos = 4)
text(0.04, 0.87, expression(bold("B")), cex = 3)
## Figure S4: re-calculate P-value by doubling FRP during the last 25% of the ancestor's (i.e., A. sediba) range
# This function randomly samples a fossil horizon each from an ancestor and descendant temporal range (according to the assumptions in the arguments), but the probability of sampling a fossil from the last 25% of the ancestor's range is doubled. The function then calculates the proportion of age differences that are at least as great as that observed between A. sediba and Homo (i.e., a P-value)
# ARGUMENTS
# T_o: vector of overlap values (in absolute number of years)
# T_d: observed age difference between A. sediba and Homo fossil horizons
# T_R: assumed hominin temporal duration
# n.iter: number of iterations
simul.pval.SI <- function(T_o, T_d, T_R, n.iter){
anc.orig <- 0 # ancestor's origination age
anc.extinct <- T_R # ancestor's extinction age
p.value <- numeric(length = length(T_o)) # empty vector to save p-value results to
names(p.value) <- T_o
for(i in seq_along(T_o)){
# there's a 60% chance the fossil horizon will be sampled in the first 75% of the temporal range, so make 60% of the iterations fall in the first 75% of the range
anc.foss.hor1 <- runif(n.iter * 0.6, min = anc.orig, max = anc.extinct * 0.75)
# make 40% of iterations fall in the last 25% of the range
anc.foss.hor2 <- runif(n.iter * 0.4, min = anc.extinct * 0.75, max = anc.extinct)
# combine the two
anc.foss.hor <- c(anc.foss.hor1, anc.foss.hor2)
desc.orig <- anc.extinct - T_o[i] # descendant's origination age
desc.extinct <- desc.orig + T_R # descendant's extinction age
desc.foss.hor <- runif(n.iter, min = desc.orig, max = desc.extinct) # randomly select one fossil horizon from descendant range n.iter times
anc.desc.diff <- anc.foss.hor - desc.foss.hor
p.value[i] <- sum(anc.desc.diff >= T_d) / n.iter
print(i)
}
return(p.value)
}
p.vals.SI <- simul.pval.SI(T_o = prop.overlap * hom.dur_Robinson, T_d = obs.diff.LG, T_R = hom.dur_Robinson, n.iter)
# Plot the results
library(pBrackets) # for drawing curly brackets
par(mar = c(5, 4 + 0.5, 4 + 0.5, 2) + 0.1)
plot(prop.overlap * hom.dur_Robinson, p.vals.SI, type = "l", lwd = 3, xlab = "Absolute range overlap (millions of years)", ylab = expression(italic(P) * "-value"), cex.axis = 1.5, cex.lab = 1.5)
lines(prop.overlap * hom.dur_Robinson, LG_Rob_pvals, lwd = 3, lty = 2)
axis(3, at = seq(0, 1, 0.2) * hom.dur_Robinson, labels = seq(0, 1, 0.2), cex.axis = 1.5)
mtext("Relative range overlap (proportion of total range)", side = 3, cex = 1.4, line = 3)
legend("topleft", c("Doubled FRP in last 25% of range", "Uniform FRP"), lty = c(1, 2), lwd = c(2, 2), cex = 1.25, bty = "n")
# inset describing how we doubled the FRP for the last 25% of the range
rect(xleft = 0.125, xright = 0.675, ybottom = 0.01 - 0.002, ytop = 0.014 - 0.002, col = "gray", border = "black", lwd = 2)
segments(x0 = 0.125 + (0.675 - 0.125) / 4, y0 = 0.01 - 0.002, y1 = 0.014 - 0.002, lwd = 2)
segments(x0 = 0.125 + 2 * (0.675 - 0.125) / 4, y0 = 0.01 - 0.002, y1 = 0.014 - 0.002, lwd = 2)
segments(x0 = 0.125 + 3 * (0.675 - 0.125) / 4, y0 = 0.01 - 0.002, y1 = 0.014 - 0.002, lwd = 2)
rect(xleft = 0.125 + 3 * (0.675 - 0.125) / 4, xright = 0.675, ybottom = 0.01 - 0.002, ytop = 0.014 - 0.002, col = "gray30", border = "black", lwd = 2)
text("0.2", x = 0.125 + (0.675 - 0.125) / 8, y = 0.015 - 0.002, cex = 1.5)
text("0.2", x = 0.125 + (0.675 - 0.125) / 4 + (0.675 - 0.125) / 8, y = 0.015 - 0.002, cex = 1.5)
text("0.2", x = 0.125 + 2 * (0.675 - 0.125) / 4 + (0.675 - 0.125) / 8, y = 0.015 - 0.002, cex = 1.5)
text("0.4", x = 0.675 - (0.675 - 0.125) / 8, y = 0.0155 - 0.002, cex = 2.5)
brackets(x1 = 0.125 + (0.675 - 0.125) / 8, x2 = 0.125 + 2 * (0.675 - 0.125) / 4 + (0.675 - 0.125) / 8, y1 = 0.016 - 0.002, y2 = 0.016 - 0.002, h = 0.002, type = 1, lwd = 2)
text("0.6", x = 0.125 + (0.675 - 0.125) / 4 + (0.675 - 0.125) / 8, y = 0.0195 - 0.002, cex = 2.5)
text("Time", x = (0.125 + 0.675) / 2, y = 0.0085 - 0.002, cex = 1.5)
arrows(x0 = 0.25, x1 = 0.55, y0 = 0.007 - 0.002, lwd = 2)
rect(xleft = 0.05, xright = 0.75, ybottom = 0.005 - 0.002, ytop = 0.022 - 0.002, lwd = 2)