Title: | Viral Variant Location and Diversity |
---|---|
Description: | Analysis of minor alleles in Illumina sequencing data of viral genomes. Functions in 'vivaldi' primarily operate on vcf files. |
Authors: | Marissa Knoll [aut], Katherine Johnson [aut], Megan Hockman [aut], Eric Borenstein [aut], Mohammed Khalfan [aut], Elodie Ghedin [aut], David Gresham [aut, cre, cph] |
Maintainer: | David Gresham <[email protected]> |
License: | MIT + file LICENSE |
Version: | 1.0.1 |
Built: | 2025-03-12 05:18:00 UTC |
Source: | https://github.com/greshamlab/vivaldi |
Adds metadata information to the vcf dataframe
add_metadata(df, metadf, by_vcf, by_meta)
add_metadata(df, metadf, by_vcf, by_meta)
df |
A rearranged vcf dataframe (arrange_data) |
metadf |
A metadata dataframe |
by_vcf |
A vector of column names in the vcf dataframe that should be used to merge the vcf data with the metadata |
by_meta |
A vector of column names in the metadata dataframe that should be used to merge the metadata with the vcf data |
A vcf dataframe with metadata included
df <- data.frame(CHROM = c("A", "B"), POS = c(234, 240), REF = c("G", "A"), ALT = c("A", "G") ) sizes <- data.frame(segment = c("A", "B"), SegmentSize = c(2280, 2274) ) df sizes # Add a new column of sizes of the segments which are necessary for # downstream calculations such as transition:transversion (tstv) and dNdS. add_metadata(df, sizes, c('CHROM'), c('segment'))
df <- data.frame(CHROM = c("A", "B"), POS = c(234, 240), REF = c("G", "A"), ALT = c("A", "G") ) sizes <- data.frame(segment = c("A", "B"), SegmentSize = c(2280, 2274) ) df sizes # Add a new column of sizes of the segments which are necessary for # downstream calculations such as transition:transversion (tstv) and dNdS. add_metadata(df, sizes, c('CHROM'), c('segment'))
Plots distribution of all minor variants
af_distribution(df)
af_distribution(df)
df |
A dataframe that has been arranged (arrange_data) and filtered (filter_variants) |
plots with the distribution of all minor variants
# Example 1: df <- data.frame(sample = c("m1", "m2", "m1", "m2", "m1"), CHROM = c("PB1", "PB1", "PB2", "PB2", "NP"), POS = c(234, 234, 240, 240, 254), REF = c("G", "G", "A", "A", "C"), ALT = c("A", "A", "G", "G", "T"), minorfreq = c(0.010, 0.022, 0.043, 0.055, 0.011), majorfreq = c(0.990, 0.978, 0.957, 0.945, 0.989), minorcount = c(7, 15, 26, 32, 7), majorcount = c(709, 661, 574, 547, 610), gt_DP = c(716, 676, 600, 579, 617) ) af_distribution(df) # Example 2: af_distribution(example_filtered_SNV_df)
# Example 1: df <- data.frame(sample = c("m1", "m2", "m1", "m2", "m1"), CHROM = c("PB1", "PB1", "PB2", "PB2", "NP"), POS = c(234, 234, 240, 240, 254), REF = c("G", "G", "A", "A", "C"), ALT = c("A", "A", "G", "G", "T"), minorfreq = c(0.010, 0.022, 0.043, 0.055, 0.011), majorfreq = c(0.990, 0.978, 0.957, 0.945, 0.989), minorcount = c(7, 15, 26, 32, 7), majorcount = c(709, 661, 574, 547, 610), gt_DP = c(716, 676, 600, 579, 617) ) af_distribution(df) # Example 2: af_distribution(example_filtered_SNV_df)
Reads in a directory of VCF files and converts them into a single dataframe
arrange_data( vardir, reference_fasta, annotated = "yes", ntlist = c("A", "G", "T", "C", "-"), verbose = FALSE )
arrange_data( vardir, reference_fasta, annotated = "yes", ntlist = c("A", "G", "T", "C", "-"), verbose = FALSE )
vardir |
Directory path containing vcf files |
reference_fasta |
Reference fasta file used for alignment |
annotated |
Whether the VCF files are annotated using snpeff "yes" or "no" (default "yes") |
ntlist |
Nucleotides (default A, T, G, C) used for finding multiple alt alleles |
verbose |
set verbosity of the vcfR commands |
A large dataframe containing information from all input VCF files
Reads in a dataframe that has been arranged (arrange_data), filtered (filter_variants), and annotated (prepare_annotations), calculates dNdS, and outputs plots
dNdS_segment(annotation_df, SPLICEFORMS)
dNdS_segment(annotation_df, SPLICEFORMS)
annotation_df |
A rearranged, filtered, and annotated vcf dataframe - must be for amino-acid specific calculations, cannot be the same as the dataframe used for SNP calculations |
SPLICEFORMS |
A character vector of isoform names |
A plot showing the dN/dS ratio for each splice form (rather than segment) for each sample
# Sample Data head(example_filtered_SNV_df) dim(example_filtered_SNV_df) # Plot showing the dN/dS ratio for each splice form SPLICEFORMS = c("H1N1_PB2.1", "H1N1_PB1.1","H1N1_NS.2") dNdS_segment(example_filtered_SNV_df, SPLICEFORMS)
# Sample Data head(example_filtered_SNV_df) dim(example_filtered_SNV_df) # Plot showing the dN/dS ratio for each splice form SPLICEFORMS = c("H1N1_PB2.1", "H1N1_PB1.1","H1N1_NS.2") dNdS_segment(example_filtered_SNV_df, SPLICEFORMS)
Example Dataframe The DF_filt_SNVs dataframe created in the vignette
example_filtered_SNV_df
example_filtered_SNV_df
## 'example_filtered_SNV_df' A data frame with 735 rows and 57 columns:
Filters single-nucleotide variants using a coverage and frequency cutoff
filter_variants(df, coverage_cutoff = 200, frequency_cutoff = 0.03)
filter_variants(df, coverage_cutoff = 200, frequency_cutoff = 0.03)
df |
A rearranged VCF dataframe (rearranged using the arrange_data function) |
coverage_cutoff |
The coverage cutoff for calling a SNV (default: 200x) |
frequency_cutoff |
Frequency cutoff for calling a SNV (default: 3%) |
A filtered VCF dataframe
df <- data.frame(CHROM = c("A", "B", "C"), POS = c(234, 240, 255), ALT_FREQ = c(0.016, 0.049, 0.031), gt_DP = c(716, 600, 187) ) df # Default: filter by 3% frequency threshold and 200 coverage cutoff filter_variants(df) # Example 1: A 1% allele frequency threshold and 200 coverage cutoff filter_variants(df, coverage_cutoff = 200, frequency_cutoff = 0.01) # Example 2: A 2% allele frequency threshold and 100 coverage cutoff filter_variants(df, coverage_cutoff = 100, frequency_cutoff = 0.02)
df <- data.frame(CHROM = c("A", "B", "C"), POS = c(234, 240, 255), ALT_FREQ = c(0.016, 0.049, 0.031), gt_DP = c(716, 600, 187) ) df # Default: filter by 3% frequency threshold and 200 coverage cutoff filter_variants(df) # Example 1: A 1% allele frequency threshold and 200 coverage cutoff filter_variants(df, coverage_cutoff = 200, frequency_cutoff = 0.01) # Example 2: A 2% allele frequency threshold and 100 coverage cutoff filter_variants(df, coverage_cutoff = 100, frequency_cutoff = 0.02)
Merges replicate VCF files into a single dataframe
merge_replicates(vardf, repdata, nameofrep1, nameofrep2, commoncols)
merge_replicates(vardf, repdata, nameofrep1, nameofrep2, commoncols)
vardf |
Data frame of variants |
repdata |
Data frame of replicate information |
nameofrep1 |
Name of variable representing the first replicate, must be written with quotes |
nameofrep2 |
Name of variable representing the second replicate |
commoncols |
List of columns to merge the replicates by |
a data frame containing replicate information
df <- data.frame(sample = c("m1", "m2", "m1", "m2", "m1"), CHROM = c("PB1", "PB1", "PB2", "PB2", "NP"), POS = c(234, 234, 240, 240, 254), REF = c("G", "G", "A", "A", "C"), ALT = c("A", "A", "G", "G", "T"), minorfreq = c(0.010, 0.022, 0.043, 0.055, 0.011), majorfreq = c(0.990, 0.978, 0.957, 0.945, 0.989), minorcount = c(7, 15, 26, 32, 7), majorcount = c(709, 661, 574, 547, 610), gt_DP = c(716, 676, 600, 579, 617) ) # Dataframe shows a pair of replicates and their variants at 3 positions. df replicates <- data.frame(filename = c("m1","m2"), replicate = c("rep1", "rep2"), sample = c("a_2_iv", "a_2_iv") ) # Dataframe showing relationship between filename, replicate, and sample name replicates # Merge by the following columns cols = c("sample","CHROM","POS","REF","ALT") merge_replicates(df, replicates, "rep1", "rep2", cols) # The dataframe now contains the 2 variants at positions 234 & 240 that were # detected in both sequencing replicates whereas the variant at position 254 # was only in a single replicate so it was removed during the merge.
df <- data.frame(sample = c("m1", "m2", "m1", "m2", "m1"), CHROM = c("PB1", "PB1", "PB2", "PB2", "NP"), POS = c(234, 234, 240, 240, 254), REF = c("G", "G", "A", "A", "C"), ALT = c("A", "A", "G", "G", "T"), minorfreq = c(0.010, 0.022, 0.043, 0.055, 0.011), majorfreq = c(0.990, 0.978, 0.957, 0.945, 0.989), minorcount = c(7, 15, 26, 32, 7), majorcount = c(709, 661, 574, 547, 610), gt_DP = c(716, 676, 600, 579, 617) ) # Dataframe shows a pair of replicates and their variants at 3 positions. df replicates <- data.frame(filename = c("m1","m2"), replicate = c("rep1", "rep2"), sample = c("a_2_iv", "a_2_iv") ) # Dataframe showing relationship between filename, replicate, and sample name replicates # Merge by the following columns cols = c("sample","CHROM","POS","REF","ALT") merge_replicates(df, replicates, "rep1", "rep2", cols) # The dataframe now contains the 2 variants at positions 234 & 240 that were # detected in both sequencing replicates whereas the variant at position 254 # was only in a single replicate so it was removed during the merge.
Reads in a dataframe that has been arranged (arrange_data), filtered (filter_variants), and piped through the Shannon calculations (shannon_entropy) and outputs plots
plot_shannon(shannon_df)
plot_shannon(shannon_df)
shannon_df |
A dataframe that has been arranged (arrange_data), filtered (filter_variants), and piped through the Shannon calculations (shannon_entropy) |
The 'plot_shannon()' function takes the variant dataframe and generates three plots. 1. The Shannon entropy, or amount of diversity, at each position in the genome at which a variant was found. 2. The Shannon entropy summed over each segment 3. The Shannon entropy summed over each genome A higher value indicates more diversity.
Three plots showing the nt Shannon, chrom Shannon, and full genome Shannon calculations
# Sample dataframe df <- data.frame(sample = c("m1", "m2", "m1", "m2", "m1"), CHROM = c("PB1", "PB1", "PB2", "PB2", "NP"), POS = c(234, 234, 240, 240, 254), minorfreq = c(0.010, 0.022, 0.043, 0.055, 0.011), majorfreq = c(0.990, 0.978, 0.957, 0.945, 0.989), SegmentSize = c(2280, 2280, 2274, 2274, 1809) ) df genome_size = 13133 # Modify the dataframe to add 5 new columns of shannon entropy data: # 1. shannon_ntpos # 2. chrom_shannon # 3. genome_shannon # 4. shannon_chrom_perkb # 5. genome_shannon_perkb shannon_df = shannon_entropy(df, genome_size) # Plot plot_shannon(shannon_df)
# Sample dataframe df <- data.frame(sample = c("m1", "m2", "m1", "m2", "m1"), CHROM = c("PB1", "PB1", "PB2", "PB2", "NP"), POS = c(234, 234, 240, 240, 254), minorfreq = c(0.010, 0.022, 0.043, 0.055, 0.011), majorfreq = c(0.990, 0.978, 0.957, 0.945, 0.989), SegmentSize = c(2280, 2280, 2274, 2274, 1809) ) df genome_size = 13133 # Modify the dataframe to add 5 new columns of shannon entropy data: # 1. shannon_ntpos # 2. chrom_shannon # 3. genome_shannon # 4. shannon_chrom_perkb # 5. genome_shannon_perkb shannon_df = shannon_entropy(df, genome_size) # Plot plot_shannon(shannon_df)
Reads in a dataframe that has been arranged (arrange_data) and filtered (filter_variants) and outputs plots
position_allele_freq(vardf, segment, nt)
position_allele_freq(vardf, segment, nt)
vardf |
A rearranged (arrange_data) and filtered (filtered_variants) vcf dataframe |
segment |
Name of segment (must be in quotes) |
nt |
Position on segment (must be in quotes) |
A plot showing the the frequencies of the major and minor allele at the given position across all samples
position_allele_freq(example_filtered_SNV_df,"H1N1_NP", "1247")
position_allele_freq(example_filtered_SNV_df,"H1N1_NP", "1247")
Separates the SNPeff annotations found in an annotated and rearranged VCF dataframe (arranged using arrange_data)
prepare_annotations(df)
prepare_annotations(df)
df |
A rearranged and annotated VCF dataframe |
A dataframe containing each annotation on a separate column
# Example: Shows the separation of the ANN column based on | delimiter. test <- data.frame( ANN = c("A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P")) # The ANN column will be split based on the strings in `snpeff_info()` and # an additional "error" column. snpeff_info() # Split the SNPeff annotations in "ANN" column and save to dataframe `df` df <- prepare_annotations(df) # The one "ANN" column is split into 16 columns dim(test) dim(df)
# Example: Shows the separation of the ANN column based on | delimiter. test <- data.frame( ANN = c("A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P")) # The ANN column will be split based on the strings in `snpeff_info()` and # an additional "error" column. snpeff_info() # Split the SNPeff annotations in "ANN" column and save to dataframe `df` df <- prepare_annotations(df) # The one "ANN" column is split into 16 columns dim(test) dim(df)
Imports reference fasta, generates a dataframe with chroms and chrom lengths
read_reference_fasta_dna(reference_fasta)
read_reference_fasta_dna(reference_fasta)
reference_fasta |
The name and location of the reference fasta file used for alignment |
A dataframe containing the chroms and chrom lengths of a reference fasta
Takes a rearranged vcf dataframe and calculates the Shannon entropy
shannon_entropy(df, genome_size)
shannon_entropy(df, genome_size)
df |
A rearranged vcf dataframe (arrange_data) |
genome_size |
Size of whole genome being used |
Shannon entropy is a commonly used metric to describe the amount of genetic diversity in sequencing data. It is calculated by considering the frequency of the ALT and REF allele at every position and then summing those values over 1) a segment or 2) the entire genome. These values can then be normalized by sequence length (kb) in order to compare across different segments or samples.
A dataframe with Shannon entropy/kb calculations for the chroms and entire genome
# Sample dataframe df <- data.frame(sample = c("m1", "m2", "m1", "m2", "m1"), CHROM = c("PB1", "PB1", "PB2", "PB2", "NP"), minorfreq = c(0.010, 0.022, 0.043, 0.055, 0.011), majorfreq = c(0.990, 0.978, 0.957, 0.945, 0.989), SegmentSize = c(2280, 2280, 2274, 2274, 1809) ) df genome_size = 13133 # MOdify the dataframe to add 5 new columns of shannon entropy data: # 1. shannon_ntpos # 2. chrom_shannon # 3. genome_shannon # 4. shannon_chrom_perkb # 5. genome_shannon_perkb shannon_entropy(df, genome_size)
# Sample dataframe df <- data.frame(sample = c("m1", "m2", "m1", "m2", "m1"), CHROM = c("PB1", "PB1", "PB2", "PB2", "NP"), minorfreq = c(0.010, 0.022, 0.043, 0.055, 0.011), majorfreq = c(0.990, 0.978, 0.957, 0.945, 0.989), SegmentSize = c(2280, 2280, 2274, 2274, 1809) ) df genome_size = 13133 # MOdify the dataframe to add 5 new columns of shannon entropy data: # 1. shannon_ntpos # 2. chrom_shannon # 3. genome_shannon # 4. shannon_chrom_perkb # 5. genome_shannon_perkb shannon_entropy(df, genome_size)
Returns vector containing information in snpeff annotations
snpeff_info()
snpeff_info()
Returns vector containing information in snpeff annotations
snpeff_info()
snpeff_info()
Reads in a dataframe that has been arranged (arrange_data) and filtered (filter_variants) and outputs plots
snv_genome(vardf)
snv_genome(vardf)
vardf |
A rearranged (arrange_data) and filtered (filtered_variants) vcf dataframe |
A bar plot showing the number of variants per sample colored by their SNPEff annotation
# Example 1: Simple dataframe df <- data.frame(sample = c("m1", "m1", "m1", "m1", "m1", "m2", "m2", "m2", "m2", "m2"), annotation = c("downstrean_gene_variant", "synonymous_variant", "synonymous_variant", "stop_gained", "missense_variant", "downstrean_gene_variant", "downstrean_gene_variant", "synonymous_variant", "stop_gained", "missense_variant") ) df snv_genome(df) # Example 2: Sample dataframe snv_genome(example_filtered_SNV_df)
# Example 1: Simple dataframe df <- data.frame(sample = c("m1", "m1", "m1", "m1", "m1", "m2", "m2", "m2", "m2", "m2"), annotation = c("downstrean_gene_variant", "synonymous_variant", "synonymous_variant", "stop_gained", "missense_variant", "downstrean_gene_variant", "downstrean_gene_variant", "synonymous_variant", "stop_gained", "missense_variant") ) df snv_genome(df) # Example 2: Sample dataframe snv_genome(example_filtered_SNV_df)
Reads in the vcf dataframe and generates a plot showing the frequency and location of SNVs
snv_location(df)
snv_location(df)
df |
A rearranged dataframe |
A plot showing the location and frequency of SNVs found across samples
# Example 1: df <- data.frame(sample = c("m1", "m1", "m1", "m1", "m1", "m2", "m2", "m2", "m2", "m2"), CHROM = c("PB1", "PB1", "PB2", "PB2", "PB2", "PB1", "PB1", "PB2", "PB2", "PB2"), POS = c(234, 266, 117, 134, 180, 234, 266, 199, 88, 180), major = c("G", "G", "A", "A", "C", "G", "G", "A", "G", "C"), minor = c("A", "A", "G", "G", "T", "A", "A", "G", "A", "T"), ALT_TYPE = c("minor", "minor", "minor", "minor", "minor", "minor", "minor", "minor", "major", "minor"), minorfreq = c(0.010, 0.022, 0.043, 0.055, 0.011, 0.010, 0.022, 0.043, 0.055, 0.011), majorfreq = c(0.990, 0.978, 0.957, 0.945, 0.989, 0.990, 0.978, 0.957, 0.945, 0.989) ) df snv_location(df) # Example 2: snv_location(example_filtered_SNV_df)
# Example 1: df <- data.frame(sample = c("m1", "m1", "m1", "m1", "m1", "m2", "m2", "m2", "m2", "m2"), CHROM = c("PB1", "PB1", "PB2", "PB2", "PB2", "PB1", "PB1", "PB2", "PB2", "PB2"), POS = c(234, 266, 117, 134, 180, 234, 266, 199, 88, 180), major = c("G", "G", "A", "A", "C", "G", "G", "A", "G", "C"), minor = c("A", "A", "G", "G", "T", "A", "A", "G", "A", "T"), ALT_TYPE = c("minor", "minor", "minor", "minor", "minor", "minor", "minor", "minor", "major", "minor"), minorfreq = c(0.010, 0.022, 0.043, 0.055, 0.011, 0.010, 0.022, 0.043, 0.055, 0.011), majorfreq = c(0.990, 0.978, 0.957, 0.945, 0.989, 0.990, 0.978, 0.957, 0.945, 0.989) ) df snv_location(df) # Example 2: snv_location(example_filtered_SNV_df)
Reads in a dataframe that has been arranged (arrange_data) and filtered (filter_variants) and outputs plots
snv_segment(vardf)
snv_segment(vardf)
vardf |
A rearranged (arrange_data) and filtered (filtered_variants) vcf dataframe |
A bar plot showing the number of variants colored by their SNPEff annotation
# Example 1: Simple data df <- data.frame(sample = c("m1", "m1", "m1", "m1", "m1", "m2", "m2", "m2", "m2", "m2"), CHROM = c("PB1", "PB1", "PB2", "PB2", "PB2", "PB1", "PB1", "PB2", "PB2", "PB2"), annotation = c("downstrean_gene_variant", "synonymous_variant", "synonymous_variant", "stop_gained", "missense_variant", "downstrean_gene_variant", "downstrean_gene_variant", "synonymous_variant", "stop_gained", "missense_variant") ) df snv_segment(df) # Example 2: Sample data snv_segment(example_filtered_SNV_df)
# Example 1: Simple data df <- data.frame(sample = c("m1", "m1", "m1", "m1", "m1", "m2", "m2", "m2", "m2", "m2"), CHROM = c("PB1", "PB1", "PB2", "PB2", "PB2", "PB1", "PB1", "PB2", "PB2", "PB2"), annotation = c("downstrean_gene_variant", "synonymous_variant", "synonymous_variant", "stop_gained", "missense_variant", "downstrean_gene_variant", "downstrean_gene_variant", "synonymous_variant", "stop_gained", "missense_variant") ) df snv_segment(df) # Example 2: Sample data snv_segment(example_filtered_SNV_df)
Groups the input vcf data frame using a list of variables and tallies the number of occurrences
tally_it(df, groupit, new_colname)
tally_it(df, groupit, new_colname)
df |
A rearranged vcf dataframe (arrange_data) |
groupit |
A vector containing column names that data should be grouped by |
new_colname |
The name of the count column |
A dataframe with columns from the 'groupit' vector and the number of times each unique grouping occurs in the data
# Sample dataframe of 7 variants across 2 samples df <- data.frame( sample = c( "sample1", "sample1", "sample1", "sample2", "sample2", "sample2", "sample2"), CHROM = c("PB1", "PB2", "PB2", "LEO", "LEO", "LEO", "ALE"), SegmentSize = c(2280, 2274, 2274, 1701, 1701, 1701, 1888 ), minorfreq = c(0.04422785, 0.03738175, 0.01390202, 0.02927786, 0.03071955, 0.02626025, 0.02875321) ) # Example 1: to get the sum of variants on every segment: groupit = c('sample','CHROM', "SegmentSize") tally_it(df, groupit, "snv_count") # Example 2: to get the count across genomes: groupit = c('sample') tally_it(df, groupit, "snv_count")
# Sample dataframe of 7 variants across 2 samples df <- data.frame( sample = c( "sample1", "sample1", "sample1", "sample2", "sample2", "sample2", "sample2"), CHROM = c("PB1", "PB2", "PB2", "LEO", "LEO", "LEO", "ALE"), SegmentSize = c(2280, 2274, 2274, 1701, 1701, 1701, 1888 ), minorfreq = c(0.04422785, 0.03738175, 0.01390202, 0.02927786, 0.03071955, 0.02626025, 0.02875321) ) # Example 1: to get the sum of variants on every segment: groupit = c('sample','CHROM', "SegmentSize") tally_it(df, groupit, "snv_count") # Example 2: to get the count across genomes: groupit = c('sample') tally_it(df, groupit, "snv_count")
Plots Ts/Tv ratios
tstv_plot(df)
tstv_plot(df)
df |
TsTv dataframe generated using the tstv_ratio function |
two plots showing the K2P and simple Ts/Tv ratios
df <- tstv_ratio(example_filtered_SNV_df,1300) tstv_plot(df)
df <- tstv_ratio(example_filtered_SNV_df,1300) tstv_plot(df)
Inputs a filtered and rearranged vcf dataframe and calculates the transition/transversion ratio
tstv_ratio(df, genome_size)
tstv_ratio(df, genome_size)
df |
The filtered and rearranged variant dataframe |
genome_size |
Size of whole genome being used |
A dataframe containing the calculated transition/transversion ratio (R or basic_tstv)
tstv_ratio(example_filtered_SNV_df, 13000)
tstv_ratio(example_filtered_SNV_df, 13000)