next up previous
Next: util.character Up: CHNOSZ examples Previous: util.array

util.blast

utl.bl>   ## using def2gi
utl.bl>   def <- "gi|218295810|ref|ZP_03496590.1|"

utl.bl>   stopifnot(all.equal(def2gi(def), "218295810"))

utl.bl>   ## process some of the BLAST output for proteins
utl.bl>   ## from Bison Pool metagenome (JGI, 2007)
utl.bl>   # read the file that connects taxids with the sequence identifier
utl.bl>   tfile <- system.file("extdata/bison/gi.taxid.txt.xz", package="CHNOSZ")

utl.bl>   gi.taxid <- scan(tfile, what=as.list(character(2)), flush=TRUE)
Read 17531 records

utl.bl>   # read the file that connects names with the taxids
utl.bl>   nfile <- system.file("extdata/refseq/taxid_names.csv.xz", package="CHNOSZ")

utl.bl>   taxid.names <- read.csv(nfile)

utl.bl>   # the BLAST files
utl.bl>   sites <- c("N","S","R","Q","P")

utl.bl>   bfile <- paste("extdata/bison/bison", sites, "_vs_refseq47.blastp.xz", sep="")

utl.bl>   for(i in 1:5) {
utl.bl+     file <- system.file(bfile[i], package="CHNOSZ")
utl.bl+     # read the blast file, with default filtering settings
utl.bl+     bl <- read.blast(file)
utl.bl+     # process the blast file -- get taxon names
utl.bl+     ib <- id.blast(bl, gi.taxid, taxid.names, min.taxon=2^-7)
utl.bl+     # count each of the phyla
utl.bl+     bd <- as.matrix(sapply(unique(ib$phylum), function(x) (sum(x==ib$phylum))))
utl.bl+     colnames(bd) <- sites[i]
utl.bl+     # make a matrix -- each column for a different file
utl.bl+     if(i==1) bardata <- bd else {
utl.bl+       bardata <- merge(bardata, bd, all=TRUE, by="row.names")
utl.bl+       rownames(bardata) <- bardata$Row.names
utl.bl+       bardata <- bardata[,-1]
utl.bl+     }
utl.bl+   }
read.blast: reading bisonN_vs_refseq47.blastp.xz 
  read 5000 lines with 5000 query sequences
  similarity filtering leaves 5000 lines and 5000 query sequences
  evalue filtering leaves 5000 lines and 5000 query sequences
  max hits filtering leaves 5000 lines and 5000 query sequences
id.blast: getting taxids ... getting taxid.names ... 28 phyla, 400 taxa
  min taxon abundance filtering leaves 4167 query sequences, 4 phyla, 16 taxa
  min phylum abundance filtering leaves 4167 query sequences, 4 phyla, 16 taxa
read.blast: reading bisonS_vs_refseq47.blastp.xz 
  read 5000 lines with 5000 query sequences
  similarity filtering leaves 5000 lines and 5000 query sequences
  evalue filtering leaves 5000 lines and 5000 query sequences
  max hits filtering leaves 5000 lines and 5000 query sequences
id.blast: getting taxids ... getting taxid.names ... 32 phyla, 634 taxa
  min taxon abundance filtering leaves 2747 query sequences, 4 phyla, 16 taxa
  min phylum abundance filtering leaves 2747 query sequences, 4 phyla, 16 taxa
read.blast: reading bisonR_vs_refseq47.blastp.xz 
  read 5000 lines with 5000 query sequences
  similarity filtering leaves 5000 lines and 5000 query sequences
  evalue filtering leaves 5000 lines and 5000 query sequences
  max hits filtering leaves 5000 lines and 5000 query sequences
id.blast: getting taxids ... getting taxid.names ... 31 phyla, 694 taxa
  min taxon abundance filtering leaves 2213 query sequences, 6 phyla, 12 taxa
  min phylum abundance filtering leaves 2168 query sequences, 5 phyla, 11 taxa
read.blast: reading bisonQ_vs_refseq47.blastp.xz 
  read 5000 lines with 5000 query sequences
  similarity filtering leaves 5000 lines and 5000 query sequences
  evalue filtering leaves 5000 lines and 5000 query sequences
  max hits filtering leaves 5000 lines and 5000 query sequences
id.blast: getting taxids ... getting taxid.names ... 29 phyla, 666 taxa
  min taxon abundance filtering leaves 2900 query sequences, 6 phyla, 13 taxa
  min phylum abundance filtering leaves 2900 query sequences, 6 phyla, 13 taxa
read.blast: reading bisonP_vs_refseq47.blastp.xz 
  read 5000 lines with 5000 query sequences
  similarity filtering leaves 5000 lines and 5000 query sequences
  evalue filtering leaves 5000 lines and 5000 query sequences
  max hits filtering leaves 5000 lines and 5000 query sequences
id.blast: getting taxids ... getting taxid.names ... 30 phyla, 718 taxa
  min taxon abundance filtering leaves 2740 query sequences, 6 phyla, 13 taxa
  min phylum abundance filtering leaves 2695 query sequences, 5 phyla, 12 taxa

utl.bl>   # normalize the counts
utl.bl>   bardata[is.na(bardata)] <- 0

utl.bl>   bardata <- t(t(bardata)/colSums(bardata))

utl.bl>   # make a bar chart
utl.bl>   bp <- barplot(as.matrix(bardata), col=rainbow(nrow(bardata)),
utl.bl+     xlab="location", ylab="fractional abundance")

utl.bl>   # add labels to the bars
utl.bl>   names <- substr(row.names(bardata), 1, 3)

utl.bl>   for(i in 1:5) {
utl.bl+     bd <- bardata[,i]
utl.bl+     ib <- bd!=0
utl.bl+     y <- (cumsum(bd) - bd/2)[ib]
utl.bl+     text(bp[i], y, names[ib])
utl.bl+   }

utl.bl>   title(main=paste("Most Abundant Phyla in a Portion",
utl.bl+     "of the Bison Pool Metagenome", sep="\n"))

\begin{figure}\par
\includegraphics{pictures/utilblast1}
\par
\par
 %
\end{figure}


next up previous
Next: util.character Up: CHNOSZ examples Previous: util.array