library(dplyr)
library(sbtools)
library(stringr)
library(worrms)
library(tidyr)
sb_id <- '5a709594e4b0a9a2e9d88e4e'

sb_filenames <- item_list_files(sb_id = sb_id)

BTA <- readr::read_csv(file = sb_filenames$url[1])
Infauna <- readr::read_csv(file = sb_filenames$url[2])
SedChem <- readr::read_csv(file = sb_filenames$url[3])
Wrangling the Data
Using R to get the data from ScienceBase
To retrieve and use the data from ScienceBase, the R packages dplyr, sbtools, stringr, worrms, and tidyr will need to be loaded. After loading these packages, we will read the data in from ScienceBase.

The ScienceBase dataset ID can be found at the end of the ScienceBase link for the dataset. If the full link to the item on ScienceBase is https://www.sciencebase.gov/catalog/item/5a709594e4b0a9a2e9d88e4e, then the identifier is 5a709594e4b0a9a2e9d88e4e.

Using the ScienceBase ID, we will get information about the data files using item_list_files(sb_id = sb_id) and assign the result to an object sb_filenames.

From the object sb_filenames, we will pull the column url, which contains the URL needed to download each data file from ScienceBase. Rather than downloading local copies of the files, we will read them directly into memory with readr::read_csv(file = sb_filenames$url[n]), where n is the row number of the file we are reading. Now we have data frames to work with!
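To make the url[n] indexing above unambiguous, we can take a quick optional look at which file each row of sb_filenames points to. This small check assumes the usual columns returned by item_list_files() (fname and url):

# optional: list each file name alongside its download URL so we know which
# row number (n) corresponds to which file
sb_filenames[, c("fname", "url")]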
Using R to transform the data
After loading the data, we will have to transform it to align with Darwin Core (DwC) standard formats and terms.
Renaming Columns
Some column names may directly correlate with the definitions of some DwC terms (a 1:1 relationship), so we only have to rename them. To preserve the original data, we will create a new table from Infauna to do the manipulations. Then, we will use rename(newName = oldName) to assign new column names (function reference page).
Infauna_StationCore <- Infauna %>%
  rename(
    locationRemarks = Location,
    materialEntityID = CoreID,
    locationID = Station,
    decimalLatitude = Latitude,
    decimalLongitude = Longitude
  )
Creating New Columns
Mutating
Some renaming might be slightly more complex, requiring manipulation of a column's format or content to fit the DwC standard. For these tasks, we will use the mutate function.

In this case, a column may be in the wrong format, like DateCollected, which needs to be adjusted before being assigned to eventDate. Others, like eventID or higherGeography, are concatenations of other columns (many:1 relationships) and new information, which can be combined with paste. mutate is also used to create new columns, like geodeticDatum, which was not included in the original dataset (function reference page).
Infauna_StationCore <- Infauna_StationCore %>%
  mutate(
    geodeticDatum = "WGS84",
    eventDate = DateCollected %>%
      as.Date("%m/%d/%Y"),
    eventID = paste(Site, eventDate %>% as.character(), locationID, materialEntityID,
                    sep = "_") %>% stringr::str_remove_all(pattern = "-"),
    minimumDepthInMeters = Depth,
    maximumDepthInMeters = Depth,
    locality = paste("BOEM Lease Block", Site),
    higherGeography = paste("Gulf of Mexico",
                            paste("BOEM Lease Block", Site),
                            sep = " | ")
  )
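To make the date handling and eventID construction above concrete, here is a small illustrative check with made-up values (the site, station, and core IDs below are hypothetical, not taken from the dataset):

# parse a DateCollected-style string into an ISO 8601 date
as.Date("05/04/2014", "%m/%d/%Y")
#> [1] "2014-05-04"

# build an eventID the same way as above, then strip the hyphens
paste("AT357", "2014-05-04", "ST1", "C01", sep = "_") %>%
  stringr::str_remove_all(pattern = "-")
#> [1] "AT357_20140504_ST1_C01"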
Reconfiguring Tables
For some extensions, like the extendedMeasurementOrFact extension (eMoF for the cool kids), imported data may be structured differently than DwC requires and will need to be reconfigured.
Pivoting
After renaming and mutating columns, we may have to pivot the table from wide to long format. For extendedMeasurementOrFact tables, all measurement columns in wide format need to be pivoted into long format.
Example of Wide Format

| Measurement   | site_01 | site_02 | site_03 |
|---------------|---------|---------|---------|
| Temperature_C | 3       | 1       | 2       |
| Salinity_ppt  | 35      | 36      | 34      |
Example of Long Format

| Measurement   | site    | value |
|---------------|---------|-------|
| Temperature_C | site_01 | 3     |
| Temperature_C | site_02 | 1     |
| Temperature_C | site_03 | 2     |
| Salinity_ppt  | site_01 | 35    |
| Salinity_ppt  | site_02 | 36    |
| Salinity_ppt  | site_03 | 34    |
This is done using the pivot_longer function. We specify which columns to include in the pivot with cols = c(columnsToBeIncluded). For our eMoF table, the names of all columns listed in cols go into a new column measurementType using names_to, and the values from those columns go into a new column measurementValue using values_to. Other identifier columns will also need to be added to the eMoF table; these can be found in the guides mentioned in the Data Modeling page.
Applied to the example wide-format table above, this produces the long-format table shown earlier.
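For reference, here is a minimal sketch of that pivot applied to the illustrative wide table; the tibble and column names below are just for demonstration and are not part of the dataset:

# rebuild the small wide-format example, then pivot the site columns into long format
wide_example <- tibble::tribble(
  ~Measurement,    ~site_01, ~site_02, ~site_03,
  "Temperature_C",        3,        1,        2,
  "Salinity_ppt",        35,       36,       34
)

long_example <- wide_example %>%
  pivot_longer(
    cols = c(site_01, site_02, site_03),
    names_to = "site",
    values_to = "value"
  )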
Below we use pivot_longer to wrangle our actual data into the eMoF table; the pertinent chunk of code looks something like this:
pivot_longer(
  cols = c("Thickness (transverse) of core",
           "Depth (spatial coordinate) minimum relative to bed surface in the bed",
           "Depth (spatial coordinate) maximum relative to bed surface in the bed",
           "Proportion by volume of particles (63-2000um) in the sediment",
           "Proportion by volume of particles (0-63um) in the sediment",
           "Proportion by volume of particles (>2000um) in the sediment"
  ),
  names_to = "measurementType",
  values_to = "measurementValue",
  values_drop_na = TRUE
)
Here is the full code chunk where we first do a variety of mutates prior to the pivot and show a subset of the resulting table:
SedChem <- SedChem %>%
  mutate(
    SampleID = CoreID
  )

Infauna_emof <- Infauna %>%
  bind_rows(SedChem) %>%
  rename(
    materialEntityID = SampleID
  ) %>%
  mutate(
    eventDate = DateCollected %>%
      as.Date("%m/%d/%Y"),
    eventID = paste(Site, eventDate %>% as.character(), materialEntityID, sep = "_") %>%
      stringr::str_remove_all(pattern = "-"),
    "Depth (spatial coordinate) maximum relative to bed surface in the bed" =
      str_split_i(Fraction, pattern = "-", i = 2) %>% readr::parse_number() %>%
      as.character(),
    "Depth (spatial coordinate) minimum relative to bed surface in the bed" =
      str_split_i(Fraction, pattern = "-", i = 1) %>% readr::parse_number() %>%
      as.character(),
    "Proportion by volume of particles (63-2000um) in the sediment" = as.character(Sand),
    "Proportion by volume of particles (0-63um) in the sediment" = as.character(Mud),
    "Proportion by volume of particles (>2000um) in the sediment" = as.character(Gravel),
    "Thickness (transverse) of core" = as.character(round(CoreDiameter, digits = 2))
  ) %>%
  pivot_longer(
    cols = c("Thickness (transverse) of core",
             "Depth (spatial coordinate) minimum relative to bed surface in the bed",
             "Depth (spatial coordinate) maximum relative to bed surface in the bed",
             "Proportion by volume of particles (63-2000um) in the sediment",
             "Proportion by volume of particles (0-63um) in the sediment",
             "Proportion by volume of particles (>2000um) in the sediment"
    ),
    names_to = "measurementType",
    values_to = "measurementValue",
    values_drop_na = TRUE
  ) %>%
  select(
    eventID,
    measurementType,
    measurementValue
  ) %>%
  distinct()

# show a subset of the resulting eMoF rows
Infauna_emof %>%
  slice(922:931) %>%
  print()
# A tibble: 7 × 3
eventID measurementType measurementValue
<chr> <chr> <chr>
1 AT357_20140504_AT26144706054 Proportion by volume of particl… 44.8851836
2 AT357_20140504_AT26144706054 Proportion by volume of particl… 33.79811283
3 MC036_20140627_NA043023 Depth (spatial coordinate) mini… 0
4 MC036_20140627_NA043023 Depth (spatial coordinate) maxi… 10
5 MC036_20140627_NA043023 Proportion by volume of particl… 2.696320665
6 MC036_20140627_NA043023 Proportion by volume of particl… 97.04856398
7 MC036_20140627_NA043023 Proportion by volume of particl… 0.255115353
Joins
Joins are useful if data from multiple tables need to be included in the final table, like when we make a new table with WoRMS data, which is covered in the next section. left_join keeps every row of table x and attaches matching rows of table y by values in the specified columns, so our call will join the tables Infauna_Occurrence and uniqueAphiaSelectColumns by matching AphiaID from both (function reference page).
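As a minimal toy illustration of left_join() behavior (the tibbles and values below are made up, not from the dataset): every row of x is kept, and matching rows of y are attached by the shared key.

x <- tibble::tibble(AphiaID = c(101, 102), abundance = c(5, 3))
y <- tibble::tibble(AphiaID = c(101, 103), scientificName = c("Taxon A", "Taxon C"))
left_join(x, y, by = "AphiaID")
# AphiaID 101 gets "Taxon A"; AphiaID 102 has no match in y, so scientificName is NA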
After combining these tables and mutating our final iteration of scientificNameID, we have finished our occurrence table!
# in this case, TSN was included as taxonRemarks because it was listed in the original dataset;
# however, it is not required to have both AphiaID and TSN
Occurrence_Ext <- left_join(Infauna_Occurrence, uniqueAphiaSelectColumns, by = c("AphiaID" = "AphiaID")) %>%
  mutate(
    taxonRemarks = ifelse(is.na(TSN), NA, paste0("urn:lsid:itis.gov:itis_tsn:", TSN))
  )
Aligning to a Taxonomic Database
In this dataset, we were lucky enough to have identifiers from the WoRMS database, which is what OBIS expects the taxonomy to be aligned to. However, occurrence tables require some taxonomic information (e.g. kingdom, lsid) that may not be provided in the original data. In these cases, we will need to call taxonomic information from WoRMS using the worrms package. In addition to the required scientificName and scientificNameID fields, it is valuable to provide other information if it is available, like taxonRank, the rank of the lowest identifiable taxon of an occurrence, and the corresponding parent taxa.
We will assign the unique AphiaID values from the occurrence table to a new variable myAphiaID, which we will use to call the corresponding WoRMS data. Using lapply to circumvent limits on the number of inputs, we then use wm_record(id = x), where x is each value of myAphiaID. From the new table, we just pull the columns that we need using select.
myAphiaID <- Infauna$AphiaID %>% na.omit() %>% unique()

myAphiaID <- lapply(myAphiaID, function(x) wm_record(id = x)) %>%
  data.table::rbindlist()

uniqueAphiaSelectColumns <- select(
  .data = myAphiaID,
  scientificname,
  rank,
  kingdom,
  phylum,
  class,
  order,
  family,
  genus,
  lsid,
  AphiaID
) %>%
  rename(
    scientificName = scientificname,
    taxonRank = rank,
    scientificNameID = lsid
  )
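One optional sanity check before the join shown in the previous section: if uniqueAphiaSelectColumns had more than one row per AphiaID, the left_join would duplicate occurrence rows, so it can be worth confirming that the key is unique.

# TRUE if each AphiaID appears exactly once in the lookup table
nrow(uniqueAphiaSelectColumns) == dplyr::n_distinct(uniqueAphiaSelectColumns$AphiaID)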
Outputting Data
Once our data has been cleaned up, mapped, and is ready to be uploaded to the IPT, we will have to output the data as a CSV file. To do this, first use the select function to choose which columns to include in the final table. For example, for the event table, it will look something like this:
Infauna_Event <- bind_rows(Infauna_StationCore, Infauna_Sample) %>%
  select(
    eventID,
    parentEventID,
    eventDate,
    locationID,
    decimalLatitude,
    decimalLongitude,
    higherGeography,
    locality,
    geodeticDatum,
    minimumDepthInMeters,
    maximumDepthInMeters,
    samplingProtocol,
    locationRemarks,
    minimumDistanceAboveSurfaceInMeters,
    maximumDistanceAboveSurfaceInMeters,
    materialEntityID
  ) %>%
  distinct()
# since there were multiple occurrences listed for each event in the original dataset,
# we use distinct() here to make sure only unique events are included
Then, to export as a csv file, we will use write.csv. In this case, we named the new file gomx_sediment_macrofauna_event_(date of export).csv. If you are planning to publish your data, it is important to set na = "" so that missing values are left blank, and fileEncoding = "UTF-8", which is the encoding GBIF and OBIS use.
# check if the data directory exists and, if not, create it
if (!dir.exists(here::here("data"))) {
  dir.create(here::here("data"))
}

Infauna_Event %>%
  write.csv(
    paste0(here::here("data", "gomx_sediment_macrofauna_event_"), Sys.Date(), ".csv"),
    na = "",
    fileEncoding = "UTF-8",
    row.names = FALSE
  )
# make sure you don't pipe into write.csv from the previous chunk
Now we have a file we can upload to the IPT.