Let us apply bootstrapping to the estimation of the uncertainty of phylogenetic trees estimates.
This will be more challenging than confidence intervals for scalar random variables!
library(ape)
library(phangorn)
library(seqinr)
##
## Attaching package: 'seqinr'
## The following objects are masked from 'package:ape':
##
## as.alignment, consensus
mammals <- read.dna("http://evolution.gs.washington.edu/gs570/2016/data/ADAM7.dna", format="interleaved")
mammals_phyDat <- phyDat(mammals, type = "DNA", levels = NULL)
mammals10 <- subset(mammals_phyDat, 1:10)
mammals10_phyDat <- phyDat(mammals10, type = "DNA", levels = NULL)
mt <- modelTest(mammals10)
## [1] "JC+I"
## [1] "JC+G"
## [1] "JC+G+I"
## [1] "F81+I"
## [1] "F81+G"
## [1] "F81+G+I"
## [1] "K80+I"
## [1] "K80+G"
## [1] "K80+G+I"
## [1] "HKY+I"
## [1] "HKY+G"
## [1] "HKY+G+I"
## [1] "SYM+I"
## [1] "SYM+G"
## [1] "SYM+G+I"
## [1] "GTR+I"
## [1] "GTR+G"
## [1] "GTR+G+I"
print(mt)
## Model df logLik AIC AICw AICc AICcw BIC
## 1 JC 17 -12409.15 24852.30 1.372439e-245 24852.57 1.640936e-245 24950.12
## 2 JC+I 18 -12286.82 24609.63 6.796619e-193 24609.93 8.000264e-193 24713.21
## 3 JC+G 18 -12276.50 24589.00 2.051643e-188 24589.30 2.414978e-188 24692.58
## 4 JC+G+I 19 -12276.48 24590.97 7.681938e-189 24591.30 8.894330e-189 24700.30
## 5 F81 20 -12367.20 24774.39 1.135569e-228 24774.76 1.292125e-228 24889.48
## 6 F81+I 21 -12245.51 24533.01 2.957672e-176 24533.41 3.304513e-176 24653.85
## 7 F81+G 21 -12235.26 24512.51 8.356020e-172 24512.91 9.335915e-172 24633.35
## 8 F81+G+I 22 -12235.25 24514.49 3.104585e-172 24514.93 3.402858e-172 24641.08
## 9 K80 18 -12070.50 24177.00 5.979532e-99 24177.30 7.038476e-99 24280.58
## 10 K80+I 19 -11938.26 23914.51 5.977188e-42 23914.84 6.920530e-42 24023.84
## 11 K80+G 19 -11923.33 23884.65 1.821051e-35 23884.98 2.108456e-35 23993.98
## 12 K80+G+I 20 -11923.34 23886.68 6.589812e-36 23887.05 7.498325e-36 24001.77
## 13 HKY 21 -12016.92 24075.84 5.551978e-77 24076.24 6.203048e-77 24196.67
## 14 HKY+I 22 -11883.17 23810.34 2.496458e-19 23810.78 2.736306e-19 23936.93
## 15 HKY+G 22 -11868.75 23781.51 4.551019e-13 23781.94 4.988258e-13 23908.10
## 16 HKY+G+I 23 -11868.77 23783.54 1.642195e-13 23784.02 1.764266e-13 23915.89
## 17 SYM 22 -12041.43 24126.86 4.621269e-88 24127.30 5.065258e-88 24253.45
## 18 SYM+I 23 -11913.14 23872.27 8.880649e-33 23872.75 9.540787e-33 24004.62
## 19 SYM+G 23 -11894.89 23835.78 7.446021e-25 23836.26 7.999516e-25 23968.13
## 20 SYM+G+I 24 -11894.93 23837.86 2.630563e-25 23838.38 2.767600e-25 23975.96
## 21 GTR 25 -11979.89 24009.79 1.222781e-62 24010.35 1.258732e-62 24153.64
## 22 GTR+I 26 -11856.82 23765.64 1.269127e-09 23766.25 1.277130e-09 23915.24
## 23 GTR+G 26 -11836.65 23725.30 7.311252e-01 23725.91 7.357353e-01 23874.90
## 24 GTR+G+I 27 -11836.65 23727.30 2.688748e-01 23727.95 2.642647e-01 23882.66
dna_dist <- dist.ml(mammals10, model="JC69")
We will use various techniques: Neighbor Joining, UPGMA, and Maximum Parsimony
dm <- dna_dist
mammals_UPGMA <- upgma(dm)
mammals_NJ <- NJ(dm)
plot(mammals_UPGMA, main="UPGMA")
plot(mammals_NJ, main = "Neighbor Joining")
What is the best model that represents DNA (nucleotides) evolution?
parsimony(mammals_UPGMA,mammals10)
## [1] 1973
parsimony(mammals_NJ,mammals10)
## [1] 1927
mammals_optim <- optim.parsimony(mammals_NJ, mammals10)
## Final p-score 1927 after 0 nni operations
mammals_pratchet <- pratchet(mammals10)
## [1] "Best pscore so far: 1927"
## [1] "Best pscore so far: 1927"
## [1] "Best pscore so far: 1927"
## [1] "Best pscore so far: 1927"
## [1] "Best pscore so far: 1927"
## [1] "Best pscore so far: 1927"
## [1] "Best pscore so far: 1927"
## [1] "Best pscore so far: 1927"
## [1] "Best pscore so far: 1927"
## [1] "Best pscore so far: 1927"
## [1] "Best pscore so far: 1927"
plot(mammals_optim)
plot(mammals_pratchet)
We will use the R function optim.pml() which optimizes the tree topology and branch length for the model you have chosen.
fit <- pml(mammals_NJ, mammals10)
print(fit)
##
## loglikelihood: -12354.97
##
## unconstrained loglikelihood: -17315.56
##
## Rate matrix:
## a c g t
## a 0 1 1 1
## c 1 0 1 1
## g 1 1 0 1
## t 1 1 1 0
##
## Base frequencies:
## 0.25 0.25 0.25 0.25
fitJC <- optim.pml(fit, model = "JC", rearrangement = "stochastic")
## optimize edge weights: -12354.97 --> -12335.12
## optimize edge weights: -12335.12 --> -12335.12
## optimize topology: -12335.12 --> -12335.12
## 0
## [1] "Ratchet iteration 1 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 2 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 3 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 4 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 5 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 6 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 7 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 8 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 9 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 10 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 11 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 12 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 13 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 14 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 15 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 16 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 17 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 18 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 19 , best pscore so far: -12335.1159969643"
## [1] "Ratchet iteration 20 , best pscore so far: -12335.1159969643"
## optimize edge weights: -12335.12 --> -12335.12
## optimize topology: -12335.12 --> -12335.12
## 0
## optimize edge weights: -12335.12 --> -12335.12
logLik(fitJC)
## 'log Lik.' -12335.12 (df=17)
Now let us estimate uncertainty by bootstrapping
bs <- bootstrap.pml(fitJC, bs=100, optNni=TRUE, multicore=FALSE, control = pml.control(trace=0))
## Warning in if (!is.na(tmp)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (tmp == 1) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (tmp == 2) do_rearr <- extras$rearrangement %in% c("NNI", : the
## condition has length > 1 and only the first element will be used
## Final p-score 2033 after 0 nni operations
## Final p-score 1953 after 0 nni operations
## Final p-score 1845 after 2 nni operations
## Final p-score 1885 after 0 nni operations
## Final p-score 1883 after 0 nni operations
## Final p-score 1918 after 1 nni operations
## Final p-score 1954 after 0 nni operations
## Final p-score 1956 after 0 nni operations
## Final p-score 1903 after 0 nni operations
## Final p-score 1989 after 0 nni operations
## Final p-score 1971 after 0 nni operations
## Final p-score 1985 after 4 nni operations
## Final p-score 1875 after 0 nni operations
## Final p-score 1975 after 0 nni operations
## Final p-score 1978 after 1 nni operations
## Final p-score 1924 after 1 nni operations
## Final p-score 1892 after 1 nni operations
## Final p-score 1939 after 0 nni operations
## Final p-score 1948 after 0 nni operations
## Final p-score 1975 after 0 nni operations
## Final p-score 1904 after 0 nni operations
## Final p-score 1876 after 0 nni operations
## Final p-score 1870 after 1 nni operations
## Final p-score 1901 after 0 nni operations
## Final p-score 1966 after 0 nni operations
## Final p-score 1973 after 0 nni operations
## Final p-score 1979 after 0 nni operations
## Final p-score 1934 after 0 nni operations
## Final p-score 1949 after 1 nni operations
## Final p-score 1947 after 1 nni operations
## Final p-score 1959 after 1 nni operations
## Final p-score 1956 after 0 nni operations
## Final p-score 2015 after 0 nni operations
## Final p-score 1971 after 0 nni operations
## Final p-score 1912 after 0 nni operations
## Final p-score 1969 after 0 nni operations
## Final p-score 1849 after 0 nni operations
## Final p-score 1965 after 0 nni operations
## Final p-score 1929 after 0 nni operations
## Final p-score 1950 after 0 nni operations
## Final p-score 1904 after 0 nni operations
## Final p-score 1940 after 0 nni operations
## Final p-score 1915 after 2 nni operations
## Final p-score 1958 after 0 nni operations
## Final p-score 1997 after 0 nni operations
## Final p-score 1940 after 0 nni operations
## Final p-score 1938 after 0 nni operations
## Final p-score 1959 after 0 nni operations
## Final p-score 1998 after 0 nni operations
## Final p-score 1979 after 0 nni operations
## Final p-score 1903 after 0 nni operations
## Final p-score 1918 after 1 nni operations
## Final p-score 1877 after 0 nni operations
## Final p-score 1920 after 1 nni operations
## Final p-score 1913 after 0 nni operations
## Final p-score 1941 after 1 nni operations
## Final p-score 1920 after 0 nni operations
## Final p-score 1995 after 1 nni operations
## Final p-score 1903 after 1 nni operations
## Final p-score 1881 after 0 nni operations
## Final p-score 1969 after 1 nni operations
## Final p-score 1916 after 0 nni operations
## Final p-score 1931 after 0 nni operations
## Final p-score 1927 after 0 nni operations
## Final p-score 1886 after 0 nni operations
## Final p-score 1985 after 1 nni operations
## Final p-score 2058 after 0 nni operations
## Final p-score 1899 after 0 nni operations
## Final p-score 2014 after 1 nni operations
## Final p-score 1894 after 1 nni operations
## Final p-score 1914 after 1 nni operations
## Final p-score 1871 after 0 nni operations
## Final p-score 1872 after 0 nni operations
## Final p-score 1928 after 0 nni operations
## Final p-score 1960 after 0 nni operations
## Final p-score 1988 after 0 nni operations
## Final p-score 1891 after 0 nni operations
## Final p-score 1915 after 0 nni operations
## Final p-score 1873 after 0 nni operations
## Final p-score 1902 after 1 nni operations
## Final p-score 1975 after 1 nni operations
## Final p-score 1858 after 2 nni operations
## Final p-score 2013 after 0 nni operations
## Final p-score 1926 after 0 nni operations
## Final p-score 1894 after 0 nni operations
## Final p-score 1949 after 0 nni operations
## Final p-score 1925 after 0 nni operations
## Final p-score 1904 after 0 nni operations
## Final p-score 1886 after 0 nni operations
## Final p-score 1997 after 0 nni operations
## Final p-score 1862 after 0 nni operations
## Final p-score 1885 after 1 nni operations
## Final p-score 1936 after 0 nni operations
## Final p-score 1881 after 0 nni operations
## Final p-score 1915 after 0 nni operations
## Final p-score 1910 after 0 nni operations
## Final p-score 1926 after 0 nni operations
## Final p-score 1895 after 1 nni operations
## Final p-score 2069 after 0 nni operations
## Final p-score 1906 after 1 nni operations
plotBS(midpoint(fitJC$tree), bs, p = 50, type="p")
When you are done, you can save your phylogeny
write.tree(bs, file="bootstrap_example.tre")