SequenceLogos examples with RFAM

import GitHub
import PyPlot
import SequenceLogos
using Downloads: download
using Statistics: mean
using LogExpFunctions: xlogx

Fetch RNA family alignment RF00162 from RFAM (pre-stored as a Github Gist)

data = GitHub.gist("b63e87024fac287a1800b1555276a04b")
url = data.files["RF00162-trimmed.afa"]["raw_url"]
path = download(url; timeout = Inf)

Parse lines

seqs = String[]
for line in eachline(path)
    if startswith(line, '>')
        continue
    else
        push!(seqs, line)
    end
end

RNA nucleotides

NTs = "ACGU-";

One-hot representation

function onehot(s::String)
    return reshape(collect(s), 1, length(s)) .== collect(NTs)
end
X = reshape(reduce(hcat, onehot.(seqs)), 5, :, length(seqs));

Sequence logo

xlog2x(x) = xlogx(x) / log(oftype(x,2))
logo_from_matrix(w::AbstractMatrix) = SequenceLogos.logo_from_matrix(w, replace(NTs, '-' => '⊟'))


function seqlogo_entropic(p::AbstractMatrix; max_ylim=true)
    @assert size(p, 1) == 5 # nucleotides + gap
    w = p ./ sum(p; dims=1)
    H = sum(-xlog2x.(w); dims=1)
    @assert all(0 .≤ H .≤ log2(5))
    SequenceLogos.plot_sequence_logo(logo_from_matrix(w .* (log2(5) .- H)), SequenceLogos.nt_color)
    max_ylim && PyPlot.matplotlib.pyplot.ylim((0, log2(5)))
    PyPlot.matplotlib.pyplot.ylabel("conservation (bits)")
    PyPlot.matplotlib.pyplot.xlabel("site")
    return nothing
end
seqlogo_entropic (generic function with 1 method)

Plot!

PyPlot.matplotlib.pyplot.figure(figsize=(15,2))
seqlogo_entropic(reshape(mean(X; dims=3), 5, 108))
PyPlot.matplotlib.pyplot.gcf()

This page was generated using Literate.jl.