Chapter 4 Summary Statistics
Now that we have filtered and cleaned the data, and created the Range
and sampling Events
tables for zero-filling the data matrix, we are ready to start generating summary statistics. These summary statistics are generated annually to support Birds Canada’s reporting for Canadian PFW participants.
Let’s start this Chapter by reloading our libraries and resetting our working directories in the event you took a break and are starting a new session with your newly filtered and cleaned data set.
require(tidyverse)
require(data.table)
<- paste("Output/")
out.dir <- paste("Data/")
dat.dir
# Load full dataset
<-fread("Data/PFW_Canada_All.csv") data
Before we proceed we are going to add one more filter to the data. Specifically, there are some CATEGORY
types that are not useful for our summary purposes (e.g., hybrids = slash
& integrade
, unknown = spuh
, undescribed = form
, and domestic = domestic
).
If you want to keep all records, then skip this step.
<-data %>% filter(CATEGORY %in% c("species", "issf")) %>% droplevels() data
4.1 Average number of birds per week
Calculate the average number of individual birds per week at each station, summarized by region, province and nationally.
Note that this is not a zero-filled data matrix. I assume that all checklists have a minimum of one bird recorded. This is likely a true assumption.
# Region
<-data %>% group_by(region, Period, floor.week, loc_id) %>%
ind.week.regsummarize(loc.sum=sum(how_many))
<-ind.week.reg %>% group_by(region, Period) %>%
ind.week.regsummarize(reg.ave=mean(loc.sum))
<-cast(ind.week.reg, Period~region, value="reg.ave")
ind.week.region
write.table(format(ind.week.region, digits=4), file = paste(out.dir,"Average number of individual birds per week_region.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# Province
<-data %>% group_by(Prov, Period, floor.week, loc_id) %>% summarize(loc.sum=sum(how_many))
ind.week.prov
<-ind.week.prov %>% group_by(Prov, Period) %>%
ind.week.provsummarize(prov.ave=mean(loc.sum))
<-cast(ind.week.prov, Period~Prov, value="prov.ave")
ind.week.province
write.table(format(ind.week.province, digits=4), file = paste(out.dir,"Average number of individual birds per week_province.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# National
<-data %>% group_by(Period, floor.week, loc_id) %>% summarize(loc.sum=sum(how_many))
ind.week.nat
<-ind.week.nat %>% group_by(Period) %>%
ind.week.natsummarize(nat.ave=mean(loc.sum))
<-as.data.frame((ind.week.nat))
ind.week.nat
write.table(format(ind.week.nat, digits=4), file = paste(out.dir,"Average number of individual birds per week_national.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
4.2 Average number of species per week
Calculate the average number of species (REPORT_AS
) per week at each station, summarized by region, province and nationally.
Note that this is not a zero-filled data matrix. I assume that all checklists have a minimum of one bird recorded. This is likely a true assumption.
# Region
<-data %>% group_by(region, Period, floor.week, loc_id) %>%
sp.week.regsummarize(loc.sp=n_distinct(REPORT_AS))
<-sp.week.reg %>% group_by(region, Period) %>%
sp.week.regsummarize(reg.ave=mean(loc.sp))
<-cast(sp.week.reg, Period~region, value="reg.ave")
sp.week.reg
write.table(format(sp.week.reg, digits=3), file = paste(out.dir,"Average number of species per week_region.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# Province
<-data %>% group_by(Prov, Period, floor.week, loc_id) %>%
sp.week.provsummarize(loc.sp=n_distinct(REPORT_AS))
<-sp.week.prov %>% group_by(Prov, Period) %>% summarize(prov.ave=mean(loc.sp))
sp.week.prov
<-cast(sp.week.prov, Period~Prov, value="prov.ave")
sp.week.prov
write.table(format(sp.week.prov, digits=3), file = paste(out.dir,"Average number of species per week_province.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# National
<-data %>% group_by(Period, floor.week, loc_id) %>%
sp.week.natsummarize(loc.sp=n_distinct(REPORT_AS))
<-sp.week.nat %>% group_by(Period) %>% summarize(nat.ave=mean(loc.sp))
sp.week.nat
<-as.data.frame((sp.week.nat))
sp.week.nat
write.table(format(sp.week.nat, digits=3), file = paste(out.dir,"Average number of species per week_national.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
These number are inflated because there are species in the list that are not strictly feeder birds.
4.3 Average number of species per site
Calculate the average number of species (REPORT_AS
) per site in total years (all weeks combined) summarized by region, province and nationally.
Note that this is not a zero-filled data matrix. I assume that all checklists have a minimum of one bird recorded. This is likely a true assumption.
# Region
<-data %>% group_by(loc_id, Period, region) %>%
tot.spsummarise(mean.sp.site=n_distinct(REPORT_AS)) %>%
group_by(Period, region) %>%
summarise(mean.sp=mean(mean.sp.site))
<-cast(tot.sp, Period~region, value="mean.sp")
tot.sp
write.table(format(tot.sp, digits=3), file = paste(out.dir,"Average number of species per site in total years_region.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# Province
<-data %>% group_by(loc_id, Period, Prov) %>%
tot.sp.provsummarise(mean.sp.site=n_distinct(REPORT_AS)) %>%
group_by(Period, Prov) %>%
summarise(mean.sp=mean(mean.sp.site))
<-cast(tot.sp.prov, Period~Prov, value="mean.sp")
tot.sp.prov
write.table(format(tot.sp.prov, digits=3), file = paste(out.dir,"Average number of species per site in total years_province.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# National
<-data %>% group_by(loc_id, Period) %>%
tot.sp.natsummarise(mean.sp.site=n_distinct(REPORT_AS)) %>%
group_by(Period) %>%
summarise(mean.sp=mean(mean.sp.site))
<-as.data.frame((tot.sp.nat))
tot.sp.nat
write.table(format(tot.sp.nat, digits=3), file = paste(out.dir,"Average number of species per site in total years_national.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
These numbers may be slightly inflated because there are species in the list that are not strictly feeder birds.
4.4 Average group size when the species is present
This summary statistic captures the geometric mean group size of species for sites and year when it is present. It is therefore not a zero-filled data matrix.
# define the geometric mean function
<- function(x) exp(mean(log(x)))
gmean
# Region
<-data %>% group_by(Period, region) %>% summarize(nloc=n_distinct(loc_id)) %>% filter(nloc>=10) %>% dplyr::select(-nloc)
nsite
<-right_join(data, nsite, by=c("Period", "region"))
mean.sp
<-mean.sp %>% group_by(Period, region, REPORT_AS) %>% summarise(mean.group=gmean(how_many))
mean.sp
<-cast(mean.sp, REPORT_AS+region~Period, value="mean.group")
mean.sp
write.table(format(mean.sp, digits=3), file = paste(out.dir,"Mean group size of species_region.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# Provincial
<-data %>% group_by(Period, Prov) %>% summarize(nloc=n_distinct(loc_id)) %>% filter(nloc>=10) %>% dplyr::select(-nloc)
nsite
<-right_join(data, nsite, by=c("Period", "Prov"))
mean.sp
<-mean.sp %>% group_by(Period, Prov, REPORT_AS) %>% summarise(mean.group=gmean(how_many))
mean.sp
<-cast(mean.sp, REPORT_AS+region~Period, value="mean.group")
mean.sp
write.table(format(mean.sp, digits=3), file = paste(out.dir,"Mean group size of species_prov.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# National
<-data %>% group_by(Period) %>% summarize(nloc=n_distinct(loc_id)) %>% filter(nloc>=10) %>% dplyr::select(-nloc)
nsite
<-right_join(data, nsite, by=c("Period"))
mean.sp
<-mean.sp %>% group_by(Period, REPORT_AS) %>% summarise(mean.group=gmean(how_many))
mean.sp
<-cast(mean.sp, REPORT_AS~Period, value="mean.group")
mean.sp
write.table(format(mean.sp, digits=3), file = paste(out.dir,"Mean group size of species_national.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
4.5 Percent of sites with species at least once and mean group size
Now we will do something a little more complicated: calculate the percent of sites with species (REPORT_AS
) at least once in a season and mean group size. This is calculated for species that occur at > 10 feeder sites.
This summary statistics needs to be done on a zero-fill data matrix to correct for the number of sites an individual could have been seen but was not. To do this, we will use the Range_prov
or Range_blockcode
table, and the sampling Events
table (see Chapter 3).
# Region
# Create regional table outside loop
<- as.data.frame(matrix(data = NA, nrow = 1, ncol = 5, byrow = FALSE, dimnames = NULL))
per.site.regionnames(per.site.region) <- c("Region", "Period", "MeanGroup", "PercentSite", "Species")
write.table(per.site.region, file = paste(out.dir,"% sites with species at least once in season and mean group size_region.csv"), row.names = FALSE, append = FALSE, quote = FALSE, sep = ",")
# Read in your tables from the Output folder
<-fread("Output/Range_prov.csv")
range<-data.frame(range)
range<-fread("Output/Events.csv")
events
<-unique(data$REPORT_AS)
sp.list
# Regional species loop
for(n in 1:length(sp.list)) {
# n<-1 #for testing each species
<-NULL #clear previous dataframe
events1<-events %>% dplyr::select(loc_id, sub_id, day, month, year, Period, Prov, region)%>% filter(year!="NA")
events1
<-NULL #clear previous dataframe
sp.data <- filter(data, REPORT_AS == sp.list[n]) %>%
sp.data droplevels()
if(nrow(sp.data)>0){ #only continue if there is data
<-min(sp.data$year)
min.year<-max(sp.data$year)
max.year
# Filter events data using the sp.data min and max year
<-events1 %>% filter(year>=min.year & year<=max.year)
events1
# Merge sp.data to events data
<-dplyr::left_join(events1, sp.data, by=c("loc_id", "sub_id", "day", "month", "year", "Period", "Prov", "region"))
sp.data<-sp.data %>% filter(loc_id != "NA")
sp.data
<-NULL #clear previous dataframe
range.data<-try(range %>% dplyr::select(Prov, sp.list[n]), silent=T)
range.data
if(class(range.data) !="try-error"){
# Join tables
<-left_join(sp.data, range.data, by="Prov")
sp.data
#remove blocks that should not be included in the zero count
colnames(sp.data)[colnames(sp.data) == sp.list[n]] <- "sp"
<-sp.data %>% filter(sp>=1)
sp.data
# Zero fill the 'how_many' column
$how_many[is.na(sp.data$how_many)] <- 0
sp.data
# Now we can start building the summary statistics
# Determine the number of distinct locations and the mean group size each location
# Filter the data to include only species that have at least once per site, and seen at > 10 sites.
<-sp.data %>% group_by(region, Period) %>%
per.site.regionmutate(loc_id_sp = ifelse(how_many==0, NA, loc_id)) %>%
summarize(tot.site=n_distinct(loc_id), n.site.sp=n_distinct(loc_id_sp), sp.sum=sum(how_many), mean.group=mean(how_many)) %>% filter(sp.sum > 0) %>% filter(tot.site>=10) %>% mutate(per.site=((n.site.sp-1)/tot.site)*100) %>% #subtract 1 for NAs
::select(-sp.sum, -n.site.sp, -tot.site)
dplyr
$SpeciesCode<-sp.list[n]
per.site.region<-as.data.frame(per.site.region)
per.site.region
write.table(format(per.site.region, digits=3), file = paste(out.dir,"% sites with species at least once in season and mean group size_region.csv"), row.names = FALSE, col.name = FALSE, append = TRUE, quote = FALSE, sep = ",")
# end try catch
} # end if nrow>0
} #end species loop
}
########################################
# Province
# Create provincial table outside loop
<- as.data.frame(matrix(data = NA, nrow = 1, ncol = 5, byrow = FALSE, dimnames = NULL))
per.site.provnames(per.site.prov) <- c("Prov", "Period", "MeanGroup", "PercentSite", "Species")
write.table(per.site.prov, file = paste(out.dir,"% sites with species at least once in season and mean group size_prov.csv"), row.names = FALSE, append = FALSE, quote = FALSE, sep = ",")
<-fread("Output/Range_prov.csv")
range<-data.frame(range)
range<-fread("Output/Events.csv")
events
<-unique(data$REPORT_AS)
sp.list
# Provincial species loop
for(n in 1:length(sp.list)) {
# n<-1 #for testing each species
<-NULL #clear previous dataframe
events1<-events %>% dplyr::select(loc_id, sub_id, day, month, year, Period, Prov, region)%>% filter(year!="NA")
events1
<-NULL
sp.data <- filter(data, REPORT_AS == sp.list[n]) %>%
sp.data droplevels()
if(nrow(sp.data)>0){ #only continue if there is data
<-min(sp.data$year)
min.year<-max(sp.data$year)
max.year
# Filter events data using the sp.data min and max year
<-events1 %>% filter(year>=min.year & year<=max.year)
events1
# First merge to events data
<-dplyr::left_join(events1, sp.data, by=c("loc_id", "sub_id", "day", "month", "year", "Period", "Prov", "region"))
sp.data<-sp.data %>% filter(loc_id != "NA")
sp.data
<-NULL
range.data
# Some oddly labeled species are not in the block list. We therefore need a trycath to catch these errors so that the loop doesn't get broken.
<-try(range %>% dplyr::select(Prov, sp.list[n]), silent=TRUE)
range.data
if(class(range.data) !="try-error"){
# Join tables
<-left_join(sp.data, range.data, by="Prov")
sp.data
# Remove blocks that should not be included in the zero count
colnames(sp.data)[colnames(sp.data) == sp.list[n]] <- "sp"
<-sp.data %>% filter(sp>=1)
sp.data
# Zero fill the 'how_many' column
$how_many[is.na(sp.data$how_many)] <- 0
sp.data
# Now we can start building the summary statistics
# Determine the number of distinct locations and the mean group size each location
<-sp.data %>% group_by(Prov, Period) %>%
per.site.provmutate(loc_id_sp = ifelse(how_many==0, NA, loc_id)) %>%
summarize(tot.site=n_distinct(loc_id), n.site.sp=n_distinct(loc_id_sp), sp.sum=sum(how_many), mean.group=mean(how_many)) %>% filter(sp.sum > 0) %>% filter(tot.site>=10) %>% mutate(per.site=((n.site.sp-1)/tot.site)*100) %>% #subtract 1 for NAs
::select(-sp.sum, -n.site.sp, -tot.site)
dplyr
$SpeciesCode<-sp.list[n]
per.site.prov<-as.data.frame(per.site.prov)
per.site.prov
write.table(format(per.site.prov, digits=3), file = paste(out.dir,"% sites with species at least once in season and mean group size_prov.csv"), row.names = FALSE, col.name = FALSE, append = TRUE, quote = FALSE, sep = ",")
#end try catch
} # end if nrow>0
} #end species loop
}
########################################
# National
# Create provincial table outside loop
<- as.data.frame(matrix(data = NA, nrow = 1, ncol = 4, byrow = FALSE, dimnames = NULL))
per.site.natnames(per.site.nat) <- c("Period", "MeanGroup", "PercentSite", "Species")
write.table(per.site.nat, file = paste(out.dir,"% sites with species at least once in season and mean group size_nat.csv"), row.names = FALSE, append = FALSE, quote = FALSE, sep = ",")
<-fread("Output/Range_prov.csv")
range<-data.frame(range)
range<-fread("Output/Events.csv")
events
<-unique(data$REPORT_AS)
sp.list
# Provincial species loop
for(n in 1:length(sp.list)) {
# n<-1 #for testing each species
<-NULL #clear previous dataframe
events1<-events %>% dplyr::select(loc_id, sub_id, day, month, year, Period, Prov, region)%>% filter(year!="NA")
events1
<-NULL
sp.data <- filter(data, REPORT_AS == sp.list[n]) %>%
sp.data droplevels()
if(nrow(sp.data)>0){ #only continue if there is data
<-min(sp.data$year)
min.year<-max(sp.data$year)
max.year
# Filter events data using the sp.data min and max year
<-events1 %>% filter(year>=min.year & year<=max.year)
events1
# First merge to events data
<-dplyr::left_join(events1, sp.data, by=c("loc_id", "sub_id", "day", "month", "year", "Period", "Prov", "region"))
sp.data<-sp.data %>% filter(loc_id != "NA")
sp.data
<-NULL
range.data<-try(range %>% dplyr::select(Prov, sp.list[n]), silent=T)
range.data
if(class(range.data) !="try-error"){
# Join tables
<-left_join(sp.data, range.data, by="Prov")
sp.data
# Remove blocks that should not be included in the zero count
colnames(sp.data)[colnames(sp.data) == sp.list[n]] <- "sp"
<-sp.data %>% filter(sp>=1)
sp.data
# Zero fill the 'how_many' column
$how_many[is.na(sp.data$how_many)] <- 0
sp.data
# Now we can start building the summary statistics
# Determine the number of distinct locations and the mean group size each location
<-sp.data %>% group_by(Period) %>%
per.site.natmutate(loc_id_sp = ifelse(how_many==0, NA, loc_id)) %>%
summarize(tot.site=n_distinct(loc_id), n.site.sp=n_distinct(loc_id_sp), sp.sum=sum(how_many), mean.group=mean(how_many)) %>% filter(sp.sum > 0) %>% filter(tot.site>=10) %>% mutate(per.site=((n.site.sp-1)/tot.site)*100) %>% #subtract 1 for NAs
::select(-sp.sum, -n.site.sp, -tot.site)
dplyr
$SpeciesCode<-sp.list[n]
per.site.nat<-as.data.frame(per.site.nat)
per.site.nat
write.table(format(per.site.nat, digits=3), file = paste(out.dir,"% sites with species at least once in season and mean group size_nat.csv"), row.names = FALSE, col.name = FALSE, append = TRUE, quote = FALSE, sep = ",")
#end try catch
} # end if nrow>0
} #end species loop }
4.6 Top 10 Species
The last summary statistic we will derive is the Top 10 Species each year based on the % of feeders and the mean group size, separately. To do this, we will use the output tables from the previous analysis.
# Load the tables if they are not in your working environment
<-read.csv("Output/ % sites with species at least once in season and mean group size_region.csv")
per.site.region
<-read.csv("Output/ % sites with species at least once in season and mean group size_prov.csv")
per.site.prov
<-read.csv("Output/ % sites with species at least once in season and mean group size_nat.csv")
per.site.nat
# Top 10 species based on % feeders
# Region
<-per.site.region %>% dplyr::select(-MeanGroup) %>% group_by(Period, Region) %>% slice_max(order_by = PercentSite, n = 10)
top10.per.reg<-as.data.frame(top10.per.reg)
top10.per.reg
write.table(format(top10.per.reg, digits=3), file = paste(out.dir,"Top 10 species percent feeders_region.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# Prov
<-per.site.prov %>% dplyr::select(-MeanGroup) %>% group_by(Period, Prov) %>% slice_max(order_by = PercentSite, n = 25)
top10.per.prov<-as.data.frame(top10.per.prov)
top10.per.prov
write.table(format(top10.per.prov, digits=3), file = paste(out.dir,"Top 10 species percent feeders_prov.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# National
<-per.site.nat %>% dplyr::select(-MeanGroup) %>% group_by(Period) %>% slice_max(order_by = PercentSite, n = 10)
top10.per.nat<-as.data.frame(top10.per.nat)
top10.per.nat
write.table(format(top10.per.nat, digits=3), file = paste(out.dir,"Top 10 species percent feeders_nat.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# Top 10 species based on MeanGroup size
# Region
<-per.site.region %>% dplyr::select(-PercentSite) %>% group_by(Period, Region) %>% slice_max(order_by = MeanGroup, n = 10)
top10.mean.reg<-as.data.frame(top10.mean.reg)
top10.mean.reg
write.table(format(top10.mean.reg, digits=3), file = paste(out.dir,"Top 10 species mean group feeders_region.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# Prov
<-per.site.prov %>% dplyr::select(-PercentSite) %>% group_by(Period, Prov) %>% slice_max(order_by = MeanGroup, n = 10)
top10.mean.prov<-as.data.frame(top10.mean.prov)
top10.mean.prov
write.table(format(top10.mean.prov, digits=3), file = paste(out.dir,"Top 10 species mean group feeders_prov.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
# National
<-per.site.nat %>% dplyr::select(-PercentSite) %>% group_by(Period) %>% slice_max(order_by = MeanGroup, n = 10)
top10.mean.nat<-as.data.frame(top10.mean.nat)
top10.mean.nat
write.table(format(top10.mean.nat, digits=3), file = paste(out.dir,"Top 10 species mean group feeders_nat.csv"), row.names = FALSE, col.name = TRUE, append = FALSE, quote = FALSE, sep = ",")
4.7 Plot Species Means
Now that we have generated the datatables, let’s plot some data of interest.
First, lets look at changes in the mean number of species over time. This can be done regionally, provincially, and/ or nationally with only slight changes to the code below. Also, you can simply change the filter
to whichever species you want to plot over time.
Notice that the
REPORT_AS
is now labled asSpecies
in the output tables.
# Load the tables if they are not in your working environment
<-read.csv("Output/ % sites with species at least once in season and mean group size_region.csv")
per.site.region
<-read.csv("Output/ % sites with species at least once in season and mean group size_prov.csv")
per.site.prov
<-read.csv("Output/ % sites with species at least once in season and mean group size_nat.csv")
per.site.nat
# Filter for the species of interest
<-per.site.region %>% filter(Species=="amecro")
sp.mean
# Plot the mean number
ggplot(sp.mean, aes(Period, MeanGroup))+
geom_line(aes(colour=Region), size=1)+
theme_classic()+
theme(text=element_text(size=20))+
ylab("Mean number per site")
# Plot the % feeders
ggplot(sp.mean, aes(Period, PercentSite))+
geom_line(aes(colour=Region), size=1)+
theme_classic()+
theme(text=element_text(size=20))+
ylab("% feeder sites")
Next we are going to summarize effort in Chapter 5.