# calculations
library(geomorph) # GPA
library(shapes) # OPA
library(Morpho) # CVA
library(phangorn) # UPGMA
library(geosphere) # distm
# plotting and visualization
library(ggplot2) # plots
ggplot2::theme_set(theme_light())
library(ggforce) # geom_ellipse
library(mgcViz) # GAM visualization
library(viridis) # plot scale
library(rnaturalearth) # maps
AT - Austria
ES - Spain
GR - Greece
HR - Croatia
HU -
Hungary
MD - Moldova
ME - Montenegro
PL - Poland
PT -
Portugal
RO - Romania
RS - Serbia
SI - Slovenia
TR -
Turkey
Read raw coordinates of 19 landmarks from Zenodo (Oleksa et al., 2022)
wings <- read.csv("https://zenodo.org/record/7244070/files/EU-raw-coordinates.csv", header = TRUE)
# extract sample classifier
wings <- tidyr::separate(
data = wings,
col = file,
sep = c(7), # sample name is in the first 7 characters of file name
into = c("sample", NA),
remove = FALSE
)
# extract country classifier
wings <- tidyr::separate(
data = wings,
col = sample,
sep = c(2), # country code is in the first 2 characters of sample name
into = c("country", NA),
remove = FALSE
)
head(wings, 2)
## file sample country x1 y1 x2 y2 x3 y3 x4 y4
## 1 AT-0001-031-003678-L.dw.png AT-0001 AT 219 191 238 190 292 270 287 216
## 2 AT-0001-031-003678-R.dw.png AT-0001 AT 213 190 234 189 289 268 281 211
## x5 y5 x6 y6 x7 y7 x8 y8 x9 y9 x10 y10 x11 y11 x12 y12 x13 y13 x14
## 1 294 131 359 276 409 307 387 284 431 251 401 228 450 197 456 153 471 121 487
## 2 289 129 359 274 408 304 388 281 434 249 404 226 447 194 452 151 467 119 482
## y14 x15 y15 x16 y16 x17 y17 x18 y18 x19 y19
## 1 299 525 263 604 224 642 222 651 193 91 259
## 2 297 521 259 601 223 640 222 648 193 89 259
Read latitude and longitude.
geo.data <- read.csv("https://zenodo.org/record/7244070/files/EU-geo-data.csv", header = TRUE)
sample.geo.data <- aggregate(cbind(geo.data$latitude, geo.data$longitude), by = list(geo.data$sample), FUN = mean)
sample.geo.data <- reshape::rename(sample.geo.data, c(Group.1 = "sample", V1 = "latitude", V2 = "longitude"))
# extract country classifier
sample.geo.data <- tidyr::separate(
data = sample.geo.data,
col = sample,
sep = c(2), # country code is in the first 2 characters of sample name
into = c("country", NA),
remove = FALSE
)
head(sample.geo.data, 2)
## sample country latitude longitude
## 1 AT-0001 AT 47.0038 15.39831
## 2 AT-0002 AT 47.0038 15.39831
world <- ne_countries(scale = "medium", returnclass = "sf")
# jitter the coordinates to show all locations
jitter.x <- jitter(sample.geo.data$longitude, amount = 0.2)
jitter.y <- jitter(sample.geo.data$latitude, amount = 0.2)
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.geo.data,
aes(x = jitter.x, y = jitter.y, colour = country, shape = country), size = 1) +
scale_color_manual(name ="Country", values = rainbow(13)) +
scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
coord_sf(xlim = c(-11, 33), ylim = c(34, 56)) +
xlab("Longitude") + ylab("Latitude")
In order to avoid mistakes it is safer to use columns names instead of indexes
p <- 19 # number of landmarks
k <- 2 # number of dimensions, in this case 2 for coordinates (x, y)
# create coordinates names used by IdentiFly
xyNames <- c("x1", "y1")
for (i in 2:p) {
xyNames <- c(xyNames, paste0("x", i))
xyNames <- c(xyNames, paste0("y", i))
}
xyNames
## [1] "x1" "y1" "x2" "y2" "x3" "y3" "x4" "y4" "x5" "y5" "x6" "y6"
## [13] "x7" "y7" "x8" "y8" "x9" "y9" "x10" "y10" "x11" "y11" "x12" "y12"
## [25] "x13" "y13" "x14" "y14" "x15" "y15" "x16" "y16" "x17" "y17" "x18" "y18"
## [37] "x19" "y19"
# create coordinates names used by geomorph
xy.Names <- c("1.X", "1.Y")
for (i in 2:p) {
xy.Names <- c(xy.Names, paste(i, "X", sep = "."))
xy.Names <- c(xy.Names, paste(i, "Y", sep = "."))
}
xy.Names
## [1] "1.X" "1.Y" "2.X" "2.Y" "3.X" "3.Y" "4.X" "4.Y" "5.X" "5.Y"
## [11] "6.X" "6.Y" "7.X" "7.Y" "8.X" "8.Y" "9.X" "9.Y" "10.X" "10.Y"
## [21] "11.X" "11.Y" "12.X" "12.Y" "13.X" "13.Y" "14.X" "14.Y" "15.X" "15.Y"
## [31] "16.X" "16.Y" "17.X" "17.Y" "18.X" "18.Y" "19.X" "19.Y"
# The number of principal components used is 2*p-4 = 34, which is equal to the degrees of freedom
# create principal components names used by prcomp
pcNames <- paste0("PC", 1:(2*p-4))
pcNames
## [1] "PC1" "PC2" "PC3" "PC4" "PC5" "PC6" "PC7" "PC8" "PC9" "PC10"
## [11] "PC11" "PC12" "PC13" "PC14" "PC15" "PC16" "PC17" "PC18" "PC19" "PC20"
## [21] "PC21" "PC22" "PC23" "PC24" "PC25" "PC26" "PC27" "PC28" "PC29" "PC30"
## [31] "PC31" "PC32" "PC33" "PC34"
# create principal components names used by geomorph
compNames <- paste0("Comp", 1:(2*p-4))
compNames
## [1] "Comp1" "Comp2" "Comp3" "Comp4" "Comp5" "Comp6" "Comp7" "Comp8"
## [9] "Comp9" "Comp10" "Comp11" "Comp12" "Comp13" "Comp14" "Comp15" "Comp16"
## [17] "Comp17" "Comp18" "Comp19" "Comp20" "Comp21" "Comp22" "Comp23" "Comp24"
## [25] "Comp25" "Comp26" "Comp27" "Comp28" "Comp29" "Comp30" "Comp31" "Comp32"
## [33] "Comp33" "Comp34"
geoNames <- c("latitude", "longitude")
Before analysis all landmark configurations have to be superimposed.
# Convert 2D array into a 3D array
wings.coords <- arrayspecs(wings[xyNames], p, k)
dimnames(wings.coords)[[3]] <- wings$file
# Align the coordinates using Generalized Procrustes Analysis
GPA <- gpagen(wings.coords, print.progress = FALSE)
# Convert 3D array into a 2D array - opposite to arrayspecs
wings.aligned <- two.d.array(GPA$coords)
head(wings.aligned, 2)
## 1.X 1.Y 2.X 2.Y
## AT-0001-031-003678-L.dw.png -0.2789939 -0.05498483 -0.2503918 -0.05609707
## AT-0001-031-003678-R.dw.png -0.2837028 -0.05412563 -0.2521392 -0.05513094
## 3.X 3.Y 4.X 4.Y
## AT-0001-031-003678-L.dw.png -0.1708259 0.06535370 -0.1772270 -0.01597587
## AT-0001-031-003678-R.dw.png -0.1714110 0.06482066 -0.1820723 -0.02097479
## 5.X 5.Y 6.X 6.Y
## AT-0001-031-003678-L.dw.png -0.1649395 -0.1436844 -0.07016610 0.07576223
## AT-0001-031-003678-R.dw.png -0.1681155 -0.1439400 -0.06642008 0.07548815
## 7.X 7.Y 8.X 8.Y
## AT-0001-031-003678-L.dw.png 0.004405225 0.1234260 -0.02820789 0.08837556
## AT-0001-031-003678-R.dw.png 0.006464705 0.1217058 -0.02302708 0.08668925
## 9.X 9.Y 10.X 10.Y
## AT-0001-031-003678-L.dw.png 0.03865483 0.03964703 -0.005994006 0.004432599
## AT-0001-031-003678-R.dw.png 0.04681374 0.03971740 0.002302572 0.004465114
## 11.X 11.Y 12.X 12.Y
## AT-0001-031-003678-L.dw.png 0.06834561 -0.04118537 0.07827983 -0.1072430
## AT-0001-031-003678-R.dw.png 0.06764016 -0.04257738 0.07616792 -0.1070389
## 13.X 13.Y 14.X 14.Y 15.X
## AT-0001-031-003678-L.dw.png 0.10150398 -0.1550677 0.1218870 0.1130044 0.1797898
## AT-0001-031-003678-R.dw.png 0.09945379 -0.1547447 0.1177652 0.1129438 0.1772376
## 15.Y 16.X 16.Y 17.X
## AT-0001-031-003678-L.dw.png 0.05963885 0.2994255 0.002608403 0.3566238
## AT-0001-031-003678-R.dw.png 0.05679621 0.2982396 0.004620497 0.3568349
## 17.Y 18.X 18.Y 19.X
## AT-0001-031-003678-L.dw.png 0.0003842025 0.3707599 -0.04305200 -0.4729295
## AT-0001-031-003678-R.dw.png 0.0040404527 0.3695364 -0.03932548 -0.4715685
## 19.Y
## AT-0001-031-003678-L.dw.png 0.04465730
## AT-0001-031-003678-R.dw.png 0.04657064
sample.aligned <- aggregate(wings.aligned, by = list(wings$sample), FUN = mean)
names(sample.aligned)[names(sample.aligned) == "Group.1"] <- "sample" # rename column
# extract country code as classifier
sample.aligned <- tidyr::separate(
data = sample.aligned,
col = sample,
sep = c(2),
into = c("country", NA),
remove = FALSE
)
head(sample.aligned, 2)
## sample country 1.X 1.Y 2.X 2.Y 3.X
## 1 AT-0001 AT -0.2844528 -0.05838585 -0.2540030 -0.06055061 -0.1791217
## 2 AT-0002 AT -0.2841005 -0.05812160 -0.2560924 -0.06061664 -0.1754031
## 3.Y 4.X 4.Y 5.X 5.Y 6.X
## 1 0.06483904 -0.1812782 -0.01908254 -0.1660425 -0.1442186 -0.07292371
## 2 0.06517893 -0.1838003 -0.01870428 -0.1677755 -0.1437020 -0.07313915
## 6.Y 7.X 7.Y 8.X 8.Y 9.X 9.Y
## 1 0.07500076 0.011911497 0.1244321 -0.01547428 0.09046047 0.04583909 0.04084050
## 2 0.07616091 0.009677258 0.1229683 -0.01582622 0.09223084 0.04708244 0.04139449
## 10.X 10.Y 11.X 11.Y 12.X 12.Y
## 1 0.001339073 0.006503687 0.06921933 -0.04011336 0.07974063 -0.1025214
## 2 0.001699646 0.006111097 0.07075491 -0.03934851 0.07961461 -0.1036169
## 13.X 13.Y 14.X 14.Y 15.X 15.Y 16.X
## 1 0.1036491 -0.1503903 0.1187769 0.1142739 0.179305 0.05982758 0.2972238
## 2 0.1001694 -0.1478836 0.1202677 0.1129797 0.179610 0.06035730 0.3033193
## 16.Y 17.X 17.Y 18.X 18.Y 19.X
## 1 0.0017221934 0.3518606 -0.002184519 0.3653600 -0.04534943 -0.4709287
## 2 0.0007875057 0.3524397 -0.002419179 0.3614544 -0.04498977 -0.4699519
## 19.Y
## 1 0.04489640
## 2 0.04123325
# Convert 2D array into a 3D array
sample.3D <- arrayspecs(sample.aligned[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.aligned$sample
sample.pca <- gm.prcomp(sample.3D)
sample.pca.scores <- as.data.frame(sample.pca$x[ , compNames])
sample.pca.scores <- cbind(sample.geo.data, sample.pca.scores)
# create plot labels
variance.tab <- summary(sample.pca)$PC.summary
##
## Ordination type: Principal Component Analysis
## Centering by OLS mean
## Orthogonal projection of OLS residuals
## Number of observations: 1725
## Number of vectors 34
##
## Importance of Components:
## Comp1 Comp2 Comp3 Comp4
## Eigenvalues 0.0001959343 3.293466e-05 2.336774e-05 0.0000168889
## Proportion of Variance 0.5120436334 8.606959e-02 6.106792e-02 0.0441364999
## Cumulative Proportion 0.5120436334 5.981132e-01 6.591811e-01 0.7033176414
## Comp5 Comp6 Comp7 Comp8
## Eigenvalues 1.453208e-05 1.257551e-05 1.053209e-05 9.666578e-06
## Proportion of Variance 3.797732e-02 3.286413e-02 2.752395e-02 2.526209e-02
## Cumulative Proportion 7.412950e-01 7.741591e-01 8.016830e-01 8.269451e-01
## Comp9 Comp10 Comp11 Comp12
## Eigenvalues 8.410956e-06 7.351429e-06 7.155121e-06 6.222810e-06
## Proportion of Variance 2.198072e-02 1.921181e-02 1.869879e-02 1.626234e-02
## Cumulative Proportion 8.489259e-01 8.681377e-01 8.868364e-01 9.030988e-01
## Comp13 Comp14 Comp15 Comp16
## Eigenvalues 5.628068e-06 4.604464e-06 3.768274e-06 2.899272e-06
## Proportion of Variance 1.470807e-02 1.203305e-02 9.847794e-03 7.576794e-03
## Cumulative Proportion 9.178069e-01 9.298399e-01 9.396877e-01 9.472645e-01
## Comp17 Comp18 Comp19 Comp20
## Eigenvalues 2.505382e-06 2.253050e-06 2.138021e-06 1.835601e-06
## Proportion of Variance 6.547424e-03 5.887994e-03 5.587382e-03 4.797056e-03
## Cumulative Proportion 9.538119e-01 9.596999e-01 9.652873e-01 9.700844e-01
## Comp21 Comp22 Comp23 Comp24
## Eigenvalues 1.671666e-06 1.485871e-06 1.303561e-06 1.196571e-06
## Proportion of Variance 4.368638e-03 3.883092e-03 3.406652e-03 3.127051e-03
## Cumulative Proportion 9.744530e-01 9.783361e-01 9.817427e-01 9.848698e-01
## Comp25 Comp26 Comp27 Comp28
## Eigenvalues 1.091115e-06 7.915753e-07 7.513697e-07 7.325161e-07
## Proportion of Variance 2.851457e-03 2.068658e-03 1.963587e-03 1.914316e-03
## Cumulative Proportion 9.877212e-01 9.897899e-01 9.917535e-01 9.936678e-01
## Comp29 Comp30 Comp31 Comp32
## Eigenvalues 6.351800e-07 5.476202e-07 4.278173e-07 3.188203e-07
## Proportion of Variance 1.659943e-03 1.431120e-03 1.118033e-03 8.331869e-04
## Cumulative Proportion 9.953277e-01 9.967589e-01 9.978769e-01 9.987101e-01
## Comp33 Comp34
## Eigenvalues 2.719570e-07 2.216305e-07
## Proportion of Variance 7.107170e-04 5.791966e-04
## Cumulative Proportion 9.994208e-01 1.000000e+00
variance <- variance.tab["Proportion of Variance", "Comp1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "Comp2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")
ggplot(sample.pca.scores, aes(x = Comp1, y = Comp2, shape = sample.aligned$country, color = sample.aligned$country)) +
geom_point() +
scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
scale_color_manual(name ="Country", values = rainbow(13)) +
stat_ellipse() +
xlab(label.x) + ylab(label.y)
# define which landmarks are connected by lines in wireframe graph
link.x <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 14, 15, 16, 17)
link.y <- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13, 18, 15, 16, 17, 18)
links.apis <- cbind(link.x, link.y)
# mark minimum blue and maximum red
GP1 <- gridPar(pt.bg = "blue", link.col = "blue", pt.size = 1,
tar.pt.bg ="red", tar.link.col ="red")
# wing shape change along PC1
plotRefToTarget(M1 = sample.pca$shapes$shapes.comp1$min,
M2 = sample.pca$shapes$shapes.comp1$max,
gridPars=GP1, method = "points", links = links.apis)
# wing shape change along PC2
plotRefToTarget(M1 = sample.pca$shapes$shapes.comp2$min,
M2 = sample.pca$shapes$shapes.comp2$max,
gridPars=GP1, method = "points", links = links.apis)
# countries difer significantly in wing shape
country.MANOVA <- manova(as.matrix(sample.pca.scores[ , compNames]) ~ country, data=sample.pca.scores)
summary(country.MANOVA)
## Df Pillai approx F num Df den Df Pr(>F)
## country 12 3.7005 22.162 408 20280 < 2.2e-16 ***
## Residuals 1712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PC1
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.pca.scores,
aes(x = jitter.x, y = jitter.y, colour = Comp1), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "PC1") +
xlab("Longitude") + ylab("Latitude")
GAM <- mgcv::gam(Comp1 ~ s(longitude, latitude), data = sample.pca.scores)
anova(GAM)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Comp1 ~ s(longitude, latitude)
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(longitude,latitude) 26.18 28.43 720.7 <2e-16
viz.GAM <- getViz(GAM)
plot(viz.GAM, 1) +
l_fitRaster() +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", name = "PC1", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
# PC2
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.pca.scores,
aes(x = jitter.x, y = jitter.y, colour = Comp2), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "PC2") +
xlab("Longitude") + ylab("Latitude")
GAM <- mgcv::gam(Comp2 ~ s(longitude, latitude), data = sample.pca.scores)
anova(GAM)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Comp2 ~ s(longitude, latitude)
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(longitude,latitude) 27.86 28.90 58.47 <2e-16
viz.GAM <- getViz(GAM)
plot(viz.GAM, 1) +
l_fitRaster() + scale_fill_continuous(na.value = "transparent") +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", name = "PC2", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
# Convert 2D array into a 3D array
sample.3D <- arrayspecs(sample.aligned[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.aligned$sample
# use equal prior probability for all groups
n.country <- length(unique(sample.aligned$country)) # number of groups
sample.cva <- CVA(sample.3D, sample.aligned$country, rounds = 10000, cv = TRUE,
prior = rep(1/n.country, n.country))
## singular Covariance matrix: General inverse is used. Threshold for zero eigenvalue is 1e-10
sample.cva.scores <- as.data.frame(sample.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(sample.cva.scores) <- gsub(" ", "", colnames(sample.cva.scores))
sample.cva.scores <- cbind(sample.geo.data, sample.cva.scores)
ggplot(sample.cva.scores, aes(x = CV1, y = CV2, shape = country, color = country)) +
geom_point() +
scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
scale_color_manual(name ="Country", values = rainbow(13)) +
stat_ellipse()
Classification of the samples to countries.
CVA.class <- typprobClass(sample.cva$CVscores, groups = as.factor(sample.pca.scores$country), outlier = 0)
print(CVA.class)
## cross-validated classification results in frequencies
##
## AT ES GR HR HU MD ME PL PT RO RS SI TR
## AT 10 0 0 0 0 0 0 0 0 0 0 0 0
## ES 0 414 0 0 0 0 0 0 102 0 0 0 0
## GR 0 0 239 1 0 0 0 0 0 2 1 0 1
## HR 0 0 1 127 7 0 0 0 0 1 1 23 0
## HU 0 0 0 0 17 0 0 0 0 5 0 0 0
## MD 0 0 0 0 0 9 0 0 0 0 1 0 0
## ME 0 0 0 0 0 0 17 0 0 0 3 0 0
## PL 0 0 0 6 11 0 0 231 0 0 0 3 2
## PT 0 39 0 0 0 0 0 0 153 0 0 0 0
## RO 0 0 1 3 2 3 0 1 0 180 0 7 0
## RS 0 0 0 0 0 0 5 0 0 0 15 0 0
## SI 0 0 0 1 3 0 0 0 0 1 0 16 0
## TR 0 0 0 0 0 0 0 0 0 0 0 0 60
##
##
## cross-validated classification result in %
##
## AT ES GR HR HU MD ME
## AT 100.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## ES 0.00000 80.23256 0.00000 0.00000 0.00000 0.00000 0.00000
## GR 0.00000 0.00000 97.95082 0.40984 0.00000 0.00000 0.00000
## HR 0.00000 0.00000 0.62500 79.37500 4.37500 0.00000 0.00000
## HU 0.00000 0.00000 0.00000 0.00000 77.27273 0.00000 0.00000
## MD 0.00000 0.00000 0.00000 0.00000 0.00000 90.00000 0.00000
## ME 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 85.00000
## PL 0.00000 0.00000 0.00000 2.37154 4.34783 0.00000 0.00000
## PT 0.00000 20.31250 0.00000 0.00000 0.00000 0.00000 0.00000
## RO 0.00000 0.00000 0.50761 1.52284 1.01523 1.52284 0.00000
## RS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 25.00000
## SI 0.00000 0.00000 0.00000 4.76190 14.28571 0.00000 0.00000
## TR 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
##
## PL PT RO RS SI TR
## AT 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## ES 0.00000 19.76744 0.00000 0.00000 0.00000 0.00000
## GR 0.00000 0.00000 0.81967 0.40984 0.00000 0.40984
## HR 0.00000 0.00000 0.62500 0.62500 14.37500 0.00000
## HU 0.00000 0.00000 22.72727 0.00000 0.00000 0.00000
## MD 0.00000 0.00000 0.00000 10.00000 0.00000 0.00000
## ME 0.00000 0.00000 0.00000 15.00000 0.00000 0.00000
## PL 91.30435 0.00000 0.00000 0.00000 1.18577 0.79051
## PT 0.00000 79.68750 0.00000 0.00000 0.00000 0.00000
## RO 0.50761 0.00000 91.37056 0.00000 3.55330 0.00000
## RS 0.00000 0.00000 0.00000 75.00000 0.00000 0.00000
## SI 0.00000 0.00000 4.76190 0.00000 76.19048 0.00000
## TR 0.00000 0.00000 0.00000 0.00000 0.00000 100.00000
##
##
## overall classification accuracy: 86.26087 %
##
## Kappa statistic: 0.83708
The map shows only incorrectly classifies samples. Color and shape of the points indicate country to which a sample was classified. The classification with cross-validation
sample.class <- data.frame(sample = sample.geo.data$sample,
country = sample.geo.data$country,
latitude = sample.geo.data$latitude,
longitude = sample.geo.data$longitude,
classified.as = CVA.class$groupaffinCV)
# add column indicating incorrect classification
sample.class$error <- ifelse(sample.class$country == sample.class$classified.as, "OK", "error")
# only the incorrectly classified samples
sample.error <- sample.class[sample.class$error == "error",
c("sample", "country", "latitude", "longitude","classified.as")]
# incorrect countries classification map
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.error,
aes(x = longitude, y = latitude,
colour = classified.as,
shape = classified.as), size = 2) +
scale_color_manual(name ="Classified as", values = rainbow(13)) +
scale_shape_manual(name ="Classified as", values = c(0:2, 15:25)) +
coord_sf(xlim = c(-11, 33), ylim = c(34, 56), expand = FALSE) +
labs(x = "Longitude", y = "Latitude")
# Mahalanobis Distnces between countries
dist <- sample.cva$Dist
MD.dist <- as.matrix(dist$GroupdistMaha)
MD.dist
## AT ES GR HR HU MD ME
## AT 0.000000 10.360206 9.670937 7.467583 6.369374 7.460003 6.928884
## ES 10.360206 0.000000 11.311107 11.134169 10.974498 10.517820 10.505608
## GR 9.670937 11.311107 0.000000 5.249058 6.638746 6.750555 6.415824
## HR 7.467583 11.134169 5.249058 0.000000 2.954997 4.627411 5.274550
## HU 6.369374 10.974498 6.638746 2.954997 0.000000 5.158678 5.792680
## MD 7.460003 10.517820 6.750555 4.627411 5.158678 0.000000 5.853972
## ME 6.928884 10.505608 6.415824 5.274550 5.792680 5.853972 0.000000
## PL 6.986357 9.240762 6.253255 3.910777 3.909524 4.650527 6.207472
## PT 10.271098 1.891779 11.483281 11.224857 10.975430 10.376974 10.428489
## RO 6.367227 10.598027 5.850517 3.767795 3.801100 3.607623 4.882649
## RS 7.079043 10.793296 6.181560 4.718061 5.363764 5.233406 1.871993
## SI 8.096234 11.474258 5.994399 2.451566 3.592021 4.634304 6.506716
## TR 7.759354 10.203552 4.976778 4.316541 5.285749 5.774989 6.026874
## PL PT RO RS SI TR
## AT 6.986357 10.271098 6.367227 7.079043 8.096234 7.759354
## ES 9.240762 1.891779 10.598027 10.793296 11.474258 10.203552
## GR 6.253255 11.483281 5.850517 6.181560 5.994399 4.976778
## HR 3.910777 11.224857 3.767795 4.718061 2.451566 4.316541
## HU 3.909524 10.975430 3.801100 5.363764 3.592021 5.285749
## MD 4.650527 10.376974 3.607623 5.233406 4.634304 5.774989
## ME 6.207472 10.428489 4.882649 1.871993 6.506716 6.026874
## PL 0.000000 9.192864 4.428774 5.812371 4.148400 5.165371
## PT 9.192864 0.000000 10.564756 10.711068 11.439740 10.371718
## RO 4.428774 10.564756 0.000000 4.582683 4.244022 4.349041
## RS 5.812371 10.711068 4.582683 0.000000 5.886266 6.044034
## SI 4.148400 11.439740 4.244022 5.886266 0.000000 5.170372
## TR 5.165371 10.371718 4.349041 6.044034 5.170372 0.000000
# Significance of differences betwen countries
MD.prob <- as.matrix(dist$probsMaha)
MD.prob
## AT ES GR HR HU MD ME PL PT RO RS
## AT 0.0000 1e-04 1e-04 0.0001 0.0022 0.0024 0.0009 0.0001 1e-04 0.0002 0.0005
## ES 0.0001 0e+00 1e-04 0.0001 0.0001 0.0001 0.0001 0.0001 1e-04 0.0001 0.0001
## GR 0.0001 1e-04 0e+00 0.0001 0.0001 0.0001 0.0001 0.0001 1e-04 0.0001 0.0001
## HR 0.0001 1e-04 1e-04 0.0000 0.0243 0.0121 0.0001 0.0001 1e-04 0.0001 0.0001
## HU 0.0022 1e-04 1e-04 0.0243 0.0000 0.0169 0.0003 0.0009 1e-04 0.0019 0.0014
## MD 0.0024 1e-04 1e-04 0.0121 0.0169 0.0000 0.0065 0.0106 1e-04 0.0727 0.0167
## ME 0.0009 1e-04 1e-04 0.0001 0.0003 0.0065 0.0000 0.0001 1e-04 0.0001 0.8722
## PL 0.0001 1e-04 1e-04 0.0001 0.0009 0.0106 0.0001 0.0000 1e-04 0.0001 0.0001
## PT 0.0001 1e-04 1e-04 0.0001 0.0001 0.0001 0.0001 0.0001 0e+00 0.0001 0.0001
## RO 0.0002 1e-04 1e-04 0.0001 0.0019 0.0727 0.0001 0.0001 1e-04 0.0000 0.0002
## RS 0.0005 1e-04 1e-04 0.0001 0.0014 0.0167 0.8722 0.0001 1e-04 0.0002 0.0000
## SI 0.0001 1e-04 1e-04 0.0951 0.0526 0.0430 0.0001 0.0004 1e-04 0.0004 0.0005
## TR 0.0001 1e-04 1e-04 0.0001 0.0001 0.0012 0.0001 0.0001 1e-04 0.0001 0.0001
## SI TR
## AT 0.0001 0.0001
## ES 0.0001 0.0001
## GR 0.0001 0.0001
## HR 0.0951 0.0001
## HU 0.0526 0.0001
## MD 0.0430 0.0012
## ME 0.0001 0.0001
## PL 0.0004 0.0001
## PT 0.0001 0.0001
## RO 0.0004 0.0001
## RS 0.0005 0.0001
## SI 0.0000 0.0001
## TR 0.0001 0.0000
# UPGMA tree
tree.upgma <- upgma(MD.dist)
plot(tree.upgma, label.offset = 0.1, cex = 0.5)
There is significant correlation between geographical distances and Mahalanobis distances between countries
# calculate geographical distance between countries
country.geo.data <- aggregate(cbind(sample.geo.data$latitude, sample.geo.data$longitude),
by = list(sample.geo.data$country), FUN = mean)
country.geo.data <- reshape::rename(country.geo.data,
c(Group.1 = "country", V1 = "latitude", V2 = "longitude"))
geo.dist <- distm(country.geo.data[c("longitude","latitude")], fun = distGeo)
row.names(geo.dist) <- country.geo.data$country
colnames(geo.dist) <- country.geo.data$country
geo.dist <- geo.dist / 1000 # convert meters to kilometers
geo.dist
## AT ES GR HR HU MD ME
## AT 0.0000 1677.4948 1118.3415 219.0430 247.3659 970.9450 582.7968
## ES 1677.4948 0.0000 2280.7843 1714.9711 1922.1692 2627.6241 1897.1536
## GR 1118.3415 2280.7843 0.0000 905.0233 1029.1889 1003.6747 539.5506
## HR 219.0430 1714.9711 905.0233 0.0000 283.7703 915.0564 366.4523
## HU 247.3659 1922.1692 1029.1889 283.7703 0.0000 725.2592 539.1022
## MD 970.9450 2627.6241 1003.6747 915.0564 725.2592 0.0000 863.2765
## ME 582.7968 1897.1536 539.5506 366.4523 539.1022 863.2765 0.0000
## PL 684.1116 2269.8425 1472.0395 813.1159 535.6565 745.8927 1050.5456
## PT 2115.2567 480.9975 2761.7533 2172.5472 2362.3395 3078.4628 2372.3072
## RO 740.5166 2364.5410 801.3884 650.0248 509.1914 284.0491 581.5350
## RS 594.4712 2076.2853 577.0488 416.3741 457.5673 640.8441 222.4330
## SI 109.1522 1589.4437 1091.2539 191.8487 334.4072 1044.4924 551.7388
## TR 1098.8973 2543.9486 457.4149 929.7696 919.7317 601.6676 649.3431
## PL PT RO RS SI TR
## AT 684.1116 2115.2567 740.5166 594.4712 109.1522 1098.8973
## ES 2269.8425 480.9975 2364.5410 2076.2853 1589.4437 2543.9486
## GR 1472.0395 2761.7533 801.3884 577.0488 1091.2539 457.4149
## HR 813.1159 2172.5472 650.0248 416.3741 191.8487 929.7696
## HU 535.6565 2362.3395 509.1914 457.5673 334.4072 919.7317
## MD 745.8927 3078.4628 284.0491 640.8441 1044.4924 601.6676
## ME 1050.5456 2372.3072 581.5350 222.4330 551.7388 649.3431
## PL 0.0000 2665.4718 727.4901 906.2962 793.2635 1222.9193
## PT 2665.4718 0.0000 2822.2746 2545.8292 2034.4694 3020.5177
## RO 727.4901 2822.2746 0.0000 359.6086 798.7755 495.4293
## RS 906.2962 2545.8292 359.6086 0.0000 604.4626 513.3960
## SI 793.2635 2034.4694 798.7755 604.4626 0.0000 1117.1129
## TR 1222.9193 3020.5177 495.4293 513.3960 1117.1129 0.0000
vegan::mantel(geo.dist, MD.dist, permutations = 9999, method = "spearman")
##
## Mantel statistic based on Spearman's rank correlation rho
##
## Call:
## vegan::mantel(xdis = geo.dist, ydis = MD.dist, method = "spearman", permutations = 9999)
##
## Mantel statistic r: 0.7046
## Significance: 0.0015
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.228 0.312 0.398 0.574
## Permutation: free
## Number of permutations: 9999
# convert distance table to get distance in one column
MD.distance <- as.dist(MD.dist)
MD.distance <- as.numeric(MD.distance)
MD.distance <- data.frame(t(combn(rownames(MD.dist),2)), MD.distance)
geo.distance <- as.dist(geo.dist)
geo.distance <- as.numeric(geo.distance)
geo.distance <- data.frame(t(combn(rownames(geo.dist),2)), geo.distance)
geo.distance$MD.distance <- MD.distance$MD.distance
geo.distance$dist.name <- paste0(geo.distance$X1, "-", geo.distance$X2)
outliers <- subset(geo.distance, dist.name %in% c("AT-HU","AT-HR", "AT-SI"))
ggplot(geo.distance, aes(x = geo.distance, y = MD.distance)) +
geom_point() +
coord_trans(x = "log10") +
geom_text( data = outliers, aes(x = geo.distance , y = MD.distance, label = dist.name), hjust =-0.2) +
xlab("Geographical distance (km)") + ylab("Mahalanobis distance between wing shape")
# add column with regions consisting of well represented countries or pairs of neighboring countries
sample.aligned$region <- ifelse(sample.aligned$country == "ES"
| sample.class$country == "PT"
, "ES-PT", sample.aligned$country)
sample.aligned$region <- ifelse(sample.aligned$country == "RO"
| sample.class$country == "MD"
, "RO-MD", sample.aligned$region)
sample.aligned$region <- ifelse(sample.aligned$country == "HR"
| sample.class$country == "SI"
, "HR-SI", sample.aligned$region)
# exclude countries with sample size smaller than 25
sample.region <- sample.aligned[sample.aligned$country != "AT"
& sample.aligned$country != "HU"
& sample.aligned$country != "ME"
& sample.aligned$country != "RS", ]
sample.3D <- arrayspecs(sample.region[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.region$sample
# use equal prior probability for all groups
n.region <- length(unique(sample.region$region)) # number of groups
region.cva <- CVA(sample.3D, sample.region$region, rounds = 10000, cv = TRUE,
prior = rep(1/n.region, n.region))
## singular Covariance matrix: General inverse is used. Threshold for zero eigenvalue is 1e-10
region.cva.scores <- as.data.frame(region.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(region.cva.scores) <- gsub(" ", "", colnames(region.cva.scores))
ggplot(region.cva.scores, aes(x = CV1, y = CV2, shape = sample.region$region, color = sample.region$region)) +
geom_point() +
scale_shape_manual(name ="Region", values = c(0:2,19,5:6)) +
scale_color_manual(name ="Region", values = rainbow(6)) +
stat_ellipse()
CVA.class.region <- typprobClass(region.cva$CVscores, groups = as.factor(sample.region$region), outlier = 0)
print(CVA.class.region)
## cross-validated classification results in frequencies
##
## ES-PT GR HR-SI PL RO-MD TR
## ES-PT 708 0 0 0 0 0
## GR 0 240 1 0 2 1
## HR-SI 0 1 179 0 1 0
## PL 0 0 8 241 2 2
## RO-MD 0 1 7 1 197 1
## TR 0 0 0 0 0 60
##
##
## cross-validated classification result in %
##
## ES-PT GR HR-SI PL RO-MD TR
## ES-PT 100.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## GR 0.00000 98.36066 0.40984 0.00000 0.81967 0.40984
## HR-SI 0.00000 0.55249 98.89503 0.00000 0.55249 0.00000
## PL 0.00000 0.00000 3.16206 95.25692 0.79051 0.79051
## RO-MD 0.00000 0.48309 3.38164 0.48309 95.16908 0.48309
## TR 0.00000 0.00000 0.00000 0.00000 0.00000 100.00000
##
##
## overall classification accuracy: 98.30611 %
##
## Kappa statistic: 0.9772
wings.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018.csv", header = TRUE)
# extract sample classifier
wings.lin <- tidyr::separate(
data = wings.lin,
col = file,
sep = c(10), # sample name is in the first 10 characters of file name
into = c("sample", NA),
remove = FALSE
)
# extract lineage classifier
wings.lin <- tidyr::separate(
data = wings.lin,
col = file,
sep = c(1), # lineage code is in the first character of file name
into = c("lineage", NA),
remove = FALSE
)
head(wings.lin, 2)
## file lineage sample x1 y1 x2 y2 x3 y3 x4 y4 x5
## 1 A-ada-0698-01.dw.png A A-ada-0698 191 229 207 227 239 276 239 240 238
## 2 A-ada-0698-02.dw.png A A-ada-0698 166 243 182 241 212 295 216 258 218
## y5 x6 y6 x7 y7 x8 y8 x9 y9 x10 y10 x11 y11 x12 y12 x13 y13 x14 y14
## 1 185 295 272 331 288 319 274 341 249 325 240 344 214 342 188 348 165 372 276
## 2 204 270 294 303 312 291 297 318 274 301 263 323 240 326 214 336 192 346 302
## x15 y15 x16 y16 x17 y17 x18 y18 x19 y19
## 1 392 249 438 219 460 214 461 191 118 289
## 2 369 276 414 249 441 245 446 224 91 295
Before analysis all landmark configurations have to be superimposed.
# Convert 2D array into a 3D array
wings.lin.3D <- arrayspecs(wings.lin[xyNames], p, k)
dimnames(wings.lin.3D)[[3]] <- wings.lin$file
# Align the coordinates using Generalized Procrustes Analysis
GPA.lin <- gpagen(wings.lin.3D, print.progress = FALSE)
# Convert 3D array into a 2D array - opposite to arrayspecs
wings.lin.aligned <- two.d.array(GPA.lin$coords)
head(wings.lin.aligned, 2)
## 1.X 1.Y 2.X 2.Y 3.X
## A-ada-0698-01.dw.png -0.2882258 -0.06434325 -0.2501565 -0.06286477 -0.1943604
## A-ada-0698-02.dw.png -0.2923716 -0.06310753 -0.2547431 -0.06445348 -0.1961197
## 3.Y 4.X 4.Y 5.X 5.Y
## A-ada-0698-01.dw.png 0.06365687 -0.1805428 -0.02027358 -0.1617611 -0.1488831
## A-ada-0698-02.dw.png 0.06734610 -0.1791741 -0.01788917 -0.1633642 -0.1430801
## 6.X 6.Y 7.X 7.Y 8.X
## A-ada-0698-01.dw.png -0.06227036 0.07582712 0.01551940 0.1269482 -0.007085048
## A-ada-0698-02.dw.png -0.06100973 0.07700318 0.01203246 0.1256895 -0.012781515
## 8.Y 9.X 9.Y 10.X 10.Y
## A-ada-0698-01.dw.png 0.08970314 0.05380538 0.03986298 0.01995414 0.01273725
## A-ada-0698-02.dw.png 0.08832114 0.05477287 0.04040039 0.01750110 0.01130163
## 11.X 11.Y 12.X 12.Y 13.X
## A-ada-0698-01.dw.png 0.07423407 -0.04058406 0.07955375 -0.10196906 0.1023722
## A-ada-0698-02.dw.png 0.07342713 -0.03765121 0.08577614 -0.09750888 0.1135807
## 13.Y 14.X 14.Y 15.X 15.Y
## A-ada-0698-01.dw.png -0.1532877 0.1157134 0.1147105 0.1727068 0.05943997
## A-ada-0698-02.dw.png -0.1466154 0.1141161 0.1113145 0.1729860 0.05559041
## 16.X 16.Y 17.X 17.Y 18.X
## A-ada-0698-01.dw.png 0.2914686 0.007154321 0.3446812 0.003942388 0.3558432
## A-ada-0698-02.dw.png 0.2832388 0.002084332 0.3468660 -0.001641282 0.3628345
## 18.Y 19.X 19.Y
## A-ada-0698-01.dw.png -0.04929392 -0.4814499 0.04751676
## A-ada-0698-02.dw.png -0.04945431 -0.4775679 0.04235017
sample.lin.aligned <- aggregate(wings.lin.aligned, by = list(wings.lin$sample), FUN = mean)
names(sample.lin.aligned)[names(sample.lin.aligned) == "Group.1"] <- "sample" # rename column
# extract lineage code as classifier
sample.lin.aligned <- tidyr::separate(
data = sample.lin.aligned,
col = sample,
sep = c(1),
into = c("lineage", NA),
remove = FALSE
)
geo.data.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018-geo-data.csv", header = TRUE)
sample.lin.aligned$subspecies <- geo.data.lin$subspecies
sample.lin.aligned$country <- geo.data.lin$country
head(sample.lin.aligned, 2)
## sample lineage 1.X 1.Y 2.X 2.Y 3.X
## 1 A-ada-0698 A -0.2881589 -0.06345784 -0.2516858 -0.06342460 -0.1943791
## 2 A-ada-0700 A -0.2848260 -0.05937522 -0.2504928 -0.06047929 -0.1905145
## 3.Y 4.X 4.Y 5.X 5.Y 6.X
## 1 0.06499985 -0.1814028 -0.01881681 -0.1631677 -0.1450486 -0.06355953
## 2 0.06327772 -0.1835898 -0.01881641 -0.1617977 -0.1466439 -0.07092678
## 6.Y 7.X 7.Y 8.X 8.Y 9.X 9.Y
## 1 0.07545727 0.011953966 0.1247743 -0.01179719 0.09033662 0.05553661 0.04047449
## 2 0.07393102 0.004073201 0.1246405 -0.01836104 0.09146764 0.05396549 0.03975215
## 10.X 10.Y 11.X 11.Y 12.X 12.Y
## 1 0.016877445 0.011009900 0.07311162 -0.03837618 0.08350415 -0.1002200
## 2 0.008618304 0.008977642 0.07310915 -0.04169207 0.08477211 -0.1016867
## 13.X 13.Y 14.X 14.Y 15.X 15.Y 16.X
## 1 0.1101864 -0.1501086 0.1146867 0.1127786 0.1719523 0.05786099 0.2854625
## 2 0.1126206 -0.1516995 0.1105134 0.1122708 0.1694617 0.05566828 0.2880828
## 16.Y 17.X 17.Y 18.X 18.Y 19.X
## 1 0.004669201 0.3480093 0.0003196065 0.3614398 -0.04939595 -0.4785698
## 2 0.003335192 0.3536997 0.0010182266 0.3709922 -0.04559680 -0.4694000
## 19.Y subspecies country
## 1 0.04616779 adansonii Guinea
## 2 0.05165077 adansonii Ivory Coast
# Convert 2D array into a 3D array for easier analysis of unknown samples
sample.lin.3D <- arrayspecs(sample.lin.aligned[xy.Names], p, k)
dimnames(sample.lin.3D)[[3]] <- sample.lin.aligned$sample
# use equal prior probability for all groups
n.lin <- length(unique(sample.lin.aligned$lineage)) # number of groups
sample.lin.cva <- CVA(sample.lin.3D, sample.lin.aligned$lineage, rounds = 10000, cv = TRUE,
prior = rep(1/n.lin, n.lin))
## singular Covariance matrix: General inverse is used. Threshold for zero eigenvalue is 1e-10
sample.lin.cva.scores <- as.data.frame(sample.lin.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(sample.lin.cva.scores) <- gsub(" ", "", colnames(sample.lin.cva.scores))
sample.lin.cva.scores <- cbind(sample.lin.aligned$lineage, sample.lin.cva.scores)
names(sample.lin.cva.scores)[1] <- "lineage" # rename column
rownames(sample.lin.cva.scores) <- sample.lin.aligned$sample
ggplot(sample.lin.cva.scores, aes(x = CV1, y = CV2, color = lineage)) +
geom_point() +
coord_fixed() +
stat_ellipse()
# Convert 2D array into a 3D array
sample.only <- sample.aligned[xy.Names]
unknown.wings <- arrayspecs(sample.only, p, k)
dimnames(unknown.wings)[[3]] <- sample.only$sample
# calculate covarience for each lineage
covariances <- lapply(unique(sample.lin.cva.scores$lineage),
function(x)
cov(sample.lin.cva.scores[sample.lin.cva.scores$lineage==x,-1]))
means <- aggregate(sample.lin.cva.scores[,names(sample.lin.cva.scores) != "lineage"],
list(sample.lin.cva.scores$lineage), FUN=mean)
rownames(means) <- means$Group.1
means <- means[,names(means) != "Group.1"]
# Projection of the samples to canonical variate space from Nawrocka et al. 2018
groups <- rownames(means)
result.list <- vector(mode = "list", length = nrow(sample.aligned))
CV.list <- vector(mode = "list", length = nrow(sample.aligned))
for (r in 1:nrow(sample.aligned)) {
# Align unknown consensus with consensus from reference samples
unknown.OPA <- procOPA(GPA.lin$consensus, unknown.wings[,,r])
unknown.aligned <- unknown.OPA$Bhat
CV.row <- predict(sample.lin.cva, unknown.aligned)
# create empty list for results
result <- numeric(length(groups))
for (i in 1:length(groups)) {
MD <- mahalanobis(CV.row, unlist(means[i, ]), as.matrix(covariances[[i]]))
result[i] <- MD
}
result.list[[r]] <- result
CV.list[[r]] <- CV.row
}
CV.tab <- do.call(rbind, CV.list)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(CV.tab) <- gsub(" ", "", colnames(CV.tab))
CV.tab <- as.data.frame(CV.tab)
CV.tab <- cbind(CV.tab, sample.aligned$country)
colnames(CV.tab) <- gsub("sample.aligned\\$", "", colnames(CV.tab))
# Black ellipses A, C, M and O indicate 95% confidence regions of reference samples from Nawrocka et al. 2018
ggplot(CV.tab, aes(x = CV1, y = CV2, shape = country, color = country))+
geom_point() +
scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
scale_color_manual(name ="Country", values = rainbow(13)) +
stat_ellipse() +
stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "A"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "C"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "M"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "O"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
geom_label(means,
mapping = aes(x = CV1, y = CV2, label = rownames(means)), inherit.aes = FALSE)
MD.tab <- do.call(rbind, result.list)
MD.tab <- sqrt(MD.tab)
# column names for lineages
colnames(MD.tab) <- groups
MD.tab <- as.data.frame(MD.tab)
lineage <- colnames(MD.tab)[apply(MD.tab, 1, which.min)]
# final column names
colnames(MD.tab) <- paste0("MD.",groups)
MD.tab <- cbind(MD.tab, lineage)
rownames(MD.tab) <- sample.aligned$sample
# frequencies of the lineages
table(MD.tab$lineage)
##
## A C M O
## 179 844 652 50
# frequencies of the lineage in countries
table(sample.aligned$country, MD.tab$lineage)
##
## A C M O
## AT 0 10 0 0
## ES 52 0 464 0
## GR 13 190 0 41
## HR 0 160 0 0
## HU 0 22 0 0
## MD 1 8 0 1
## ME 0 20 0 0
## PL 97 142 8 6
## PT 12 0 180 0
## RO 2 194 0 1
## RS 0 20 0 0
## SI 0 21 0 0
## TR 2 57 0 1
sample.lineages <- MD.tab
sample.lineages <- cbind(sample.lineages,
sample.geo.data$longitude, sample.geo.data$latitude)
colnames(sample.lineages) <- gsub("sample.geo.data\\$", "", colnames(sample.lineages))
# Classification of the samples to lineages according to <a href="#Nawrocka et al. 2018">Nawrocka et al. 2018</a>
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.lineages,
aes(x = jitter.x, y = jitter.y, colour = lineage), size = 1) +
scale_color_manual(name ="Lineage", values = rainbow(4)) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
xlab("Longitude") + ylab("Latitude")
# Lineage A
# Mahalanobis Distance to lineage A (with jitter to show multiple samples from one location)
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.lineages,
aes(x = jitter.x, y = jitter.y, colour = MD.A), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "Distance to\nlineage A", direction = -1) +
xlab("Longitude") + ylab("Latitude")
# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.A ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.A <- plot(viz.GAM, 1) +
l_fitRaster() +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage A", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
GAM.A
# Lineage C
# Mahalanobis Distance to lineage C (with jitter to show multiple samples from one location)
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.lineages,
aes(x = jitter.x, y = jitter.y, colour = MD.C), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "Distance to\nlineage C", direction = -1) +
xlab("Longitude") + ylab("Latitude")
# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.C ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.C <- plot(viz.GAM, 1) +
l_fitRaster() +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage C", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
GAM.C
# Lineage M
# Mahalanobis Distance to lineage M (with jitter to show multiple samples from one location)
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.lineages,
aes(x = jitter.x, y = jitter.y, colour = MD.M), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "Distance to\nlineage M", direction = -1) +
xlab("Longitude") + ylab("Latitude")
# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.M ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.M <- plot(viz.GAM, 1) +
l_fitRaster() +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage M", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
GAM.M
# Lineage O
# Mahalanobis Distance to lineage O (with jitter to show multiple samples from one location)
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.lineages,
aes(x = jitter.x, y = jitter.y, colour = MD.O), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "Distance to\nlineage O", direction = -1) +
xlab("Longitude") + ylab("Latitude")
# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.O ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.O <- plot(viz.GAM, 1) +
l_fitRaster() +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage O", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
GAM.O
As unknown there were used European samples from lineages dataset Nawrocka et al., (2018a).
# use samples from lineages M and C as unknown
sample.unknown <- subset(sample.lin.aligned, lineage == "M" | lineage == "C")
# convert 2D array into a 3D array
unknown.lin <- arrayspecs(sample.unknown[xy.Names], p, k)
dimnames(unknown.lin)[[3]] <- sample.unknown$sample
# calculate CVA scores for unknown samples
CV.list <- vector(mode = "list", length = nrow(sample.unknown))
for (r in 1:nrow(sample.unknown)) {
# Align unknown consensus with consensus from reference samples
unknown.OPA <- procOPA(GPA$consensus, unknown.lin[,,r])
unknown.aligned <- unknown.OPA$Bhat
CV.row <- predict(region.cva, unknown.aligned)
CV.list[[r]] <- CV.row
}
CV.tab <- do.call(rbind, CV.list)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(CV.tab) <- gsub(" ", "", colnames(CV.tab))
CV.tab <- as.data.frame(CV.tab)
outlier.threshold <- 0.001
typprobs <- typprobClass(CV.tab, region.cva$CVscores,
groups = as.factor(sample.region$region),
outlier = outlier.threshold, sep = FALSE)
print(typprobs)
##
##
## specimens have been classified as:
##
## ES-PT GR HR-SI none PL RO-MD TR
## 3 3 17 12 11 3 4
# probability of classification for all groups
P.tab <- typprobs$probs
rownames(P.tab) <- sample.unknown$sample
# for each sample find maximum probability
P.max <- apply(P.tab, 1, FUN = max)
# for each sample find region with larges probability
region.max <- colnames(P.tab)[apply(P.tab, 1, which.max)]
# samples with probability below 0.001 are considered as outlines
P.tab.max <- data.frame(country = sample.unknown$country, P.max, region.max)
head(P.tab.max, 2)
## country P.max region.max
## C-car-0204 Austria 0.007747604 PL
## C-car-0216 Slovenia 0.016165649 HR-SI
# frequencies of the countries in regions without detection of outliers
table(P.tab.max$country, P.tab.max$region.max)
##
## ES-PT GR HR-SI PL RO-MD TR
## Austria 1 0 3 2 0 0
## Croatia 0 0 1 0 0 0
## Denmark 0 0 0 1 0 0
## England 0 0 0 3 0 0
## France 0 0 0 3 0 0
## Greece 0 3 4 0 0 4
## Hungary 0 0 1 0 1 0
## Italy 0 0 4 6 1 0
## Norway 1 0 0 3 0 0
## Romania 0 0 2 0 0 0
## Russia 1 0 0 1 0 0
## Serbia 0 0 2 0 1 0
## Slovenia 0 0 1 1 0 0
## Spain 2 0 0 0 0 0
P.tab.max$region.none <- ifelse(P.max < outlier.threshold, "none", region.max)
# frequencies of the countries in regions with detection of outliers
table(P.tab.max$country, P.tab.max$region.none)
##
## ES-PT GR HR-SI none PL RO-MD TR
## Austria 0 0 2 3 1 0 0
## Croatia 0 0 1 0 0 0 0
## Denmark 0 0 0 1 0 0 0
## England 0 0 0 1 2 0 0
## France 0 0 0 3 0 0 0
## Greece 0 3 4 0 0 0 4
## Hungary 0 0 1 0 0 1 0
## Italy 0 0 4 1 5 1 0
## Norway 1 0 0 1 2 0 0
## Romania 0 0 2 0 0 0 0
## Russia 1 0 0 1 0 0 0
## Serbia 0 0 2 0 0 1 0
## Slovenia 0 0 1 0 1 0 0
## Spain 1 0 0 1 0 0 0
# regions labels
region.cva.scores$region <- sample.region$region
region.cva.means <- aggregate(region.cva.scores[,names(region.cva.scores) != "region"],
list(region.cva.scores$region), FUN = mean)
rownames(region.cva.means) <- region.cva.means$Group.1
region.cva.means <- region.cva.means[,names(region.cva.means) != "Group.1"]
ggplot(CV.tab, aes(x = CV1, y = CV2, shape = sample.unknown$subspecies, color = sample.unknown$subspecies)) +
geom_point() +
scale_shape_manual(name ="subspecies", values = c(0:5)) +
scale_color_manual(name ="subspecies", values = rainbow(6)) +
stat_ellipse() +
stat_ellipse(data = subset(region.cva.scores, region == "ES-PT"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(region.cva.scores, region == "GR"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(region.cva.scores, region == "HR-SI"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(region.cva.scores, region == "PL"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(region.cva.scores, region == "RO-MD"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(region.cva.scores, region == "TR"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
geom_label(region.cva.means,
mapping = aes(x = CV1, y = CV2, label = rownames(region.cva.means)), inherit.aes = FALSE)
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018). Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49(2), 172-184. https://doi.org/10.1007/s13592-017-0538-y
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofiilski, A. (2018). Dataset: Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49, 172–184. https://doi.org/10.5281/zenodo.7567336
Oleksa, A., Căuia, E., Siceanu, A., Puškadija, Z., Kovačić, M., Pinto, M.A., Rodrigues, P.J., Hatjina, F., Charistos, L., Bouga, M., Prešern, J., Kandemir, I., Rašić, S., Kusza, S., & Tofilski, A. (2022). Collection of wing images for conservation of honey bees (Apis mellifera) biodiversity in Europe [Data set]. Zenodo. https://doi.org/10.5281/zenodo.7244070
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=Polish_Poland.1250 LC_CTYPE=Polish_Poland.1250
## [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C
## [5] LC_TIME=Polish_Poland.1250
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rnaturalearth_0.1.0 viridis_0.6.2 viridisLite_0.4.0
## [4] mgcViz_0.1.9 qgam_1.3.4 mgcv_1.8-33
## [7] nlme_3.1-149 ggforce_0.3.3 ggplot2_3.3.6
## [10] geosphere_1.5-14 phangorn_2.5.5 ape_5.4-1
## [13] Morpho_2.9 shapes_1.2.6 geomorph_4.0.4
## [16] Matrix_1.4-1 rgl_0.109.6 RRPP_1.3.0
##
## loaded via a namespace (and not attached):
## [1] matrixStats_0.62.0 sf_0.9-6 doParallel_1.0.17
## [4] RColorBrewer_1.1-3 tools_4.0.3 bslib_0.4.1
## [7] vegan_2.6-2 utf8_1.2.2 R6_2.5.1
## [10] KernSmooth_2.23-17 DBI_1.1.3 colorspace_2.0-3
## [13] permute_0.9-7 withr_2.5.0 sp_1.5-0
## [16] rnaturalearthdata_0.1.0 tidyselect_1.2.0 gridExtra_2.3
## [19] GGally_2.1.2 compiler_4.0.3 cli_3.3.0
## [22] bezier_1.1.2 isoband_0.2.5 labeling_0.4.2
## [25] sass_0.4.1 scales_1.2.1 classInt_0.4-7
## [28] quadprog_1.5-8 proxy_0.4-27 stringr_1.4.0
## [31] digest_0.6.29 minqa_1.2.4 rmarkdown_2.18
## [34] colorRamps_2.3.1 base64enc_0.1-3 jpeg_0.1-9
## [37] pkgconfig_2.0.3 htmltools_0.5.3 lme4_1.1-30
## [40] maps_3.4.0 highr_0.9 fastmap_1.1.0
## [43] htmlwidgets_1.5.4 rlang_1.0.4 rstudioapi_0.14
## [46] Rvcg_0.21 shiny_1.7.3 jquerylib_0.1.4
## [49] farver_2.1.1 generics_0.1.3 jsonlite_1.8.0
## [52] dplyr_1.0.9 magrittr_2.0.1 Rcpp_1.0.9
## [55] munsell_0.5.0 fansi_1.0.3 lifecycle_1.0.1
## [58] scatterplot3d_0.3-42 stringi_1.7.8 yaml_2.3.5
## [61] MASS_7.3-58.1 plyr_1.8.7 grid_4.0.3
## [64] parallel_4.0.3 promises_1.2.0.1 miniUI_0.1.1.1
## [67] lattice_0.20-41 splines_4.0.3 knitr_1.40
## [70] pillar_1.8.1 igraph_1.2.6 boot_1.3-25
## [73] codetools_0.2-16 fastmatch_1.1-0 glue_1.6.2
## [76] evaluate_0.18 vctrs_0.4.1 nloptr_2.0.3
## [79] tweenr_1.0.2 httpuv_1.6.5 foreach_1.5.2
## [82] purrr_0.3.4 tidyr_1.2.0 gtable_0.3.1
## [85] polyclip_1.10-0 reshape_0.8.9 assertthat_0.2.1
## [88] cachem_1.0.6 xfun_0.31 mime_0.12
## [91] xtable_1.8-4 e1071_1.7-11 later_1.3.0
## [94] class_7.3-17 minpack.lm_1.2-2 tibble_3.1.8
## [97] iterators_1.0.14 cluster_2.1.0 units_0.8-0
## [100] gamm4_0.2-6 ellipsis_0.3.2