大师网-带你快速走向大师之路 解决你在学习过程中的疑惑,带你快速进入大师之门。节省时间,提升效率

ROSALIND Bioinformatics(21-25)

刷题ROSALIND,练编程水平
http://rosalind.info/problems/list-view/

21. Locating Restriction Sites(查找限制位点)

image.png
  • 解题思路
    • 限制性核酸内切酶用于抵御外来病毒的DNA的进攻
    • 限制性核酸内切酶的酶切位点是回文片段——其反向互补链等于本身
    • 如GATATC | CTATAG
s = "TCAATGCATGCGGGTCTATATGCAT"


def pa_re(s):
    """Report whether ``s`` is identical to its own reverse complement.

    Characters other than A, C, G and T have no complement here and are left
    out of the reverse complement, so a string containing them cannot match.

    Returns 1 on a match; otherwise falls off the end and returns None.
    """
    pairing = {"G": "C", "A": "T", "T": "A", "C": "G"}
    # Walk the strand backwards and complement each base in one pass.
    re_s = "".join(pairing.get(base, "") for base in reversed(s))
    if s == re_s:
        return 1

# Diagnostic output: length of the strand being scanned.
print (len(s))
# Print "position length" (1-based position) for every reverse palindrome.
# Restriction sites are 4 to 12 bases long and always of even length, so the
# window is capped at 12 instead of growing to the end of the strand.
for i in range(len(s)):
    for j in range(4, min(12, len(s) - i) + 1, 2):
        if pa_re(s[i:i+j]):
            print (i+1,j)
string = 'TCAATGCATGCGGGTCTATATGCAT'

# Base pairing used to compare a window with its reverse complement.
_COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}


def _is_reverse_palindrome(seq):
    """Return True when the even-length ``seq`` equals its reverse complement."""
    if len(seq) % 2:
        # A base never pairs with itself, so odd lengths cannot match.
        return False
    half = len(seq) // 2
    # Comparing the first half against the mirrored second half is enough:
    # the pairing relation is symmetric.
    return all(_COMPLEMENT.get(seq[i]) == seq[-1 - i] for i in range(half))


def find_reverse_palindromes(dna, min_len=4, max_len=12):
    """List every reverse palindrome in ``dna`` as ``(position, length)``.

    Positions are 1-based. Window lengths run from ``min_len`` to ``max_len``
    in steps of 2 (``min_len`` is expected to be even), and the longest
    window that still fits at a given start is included. The original loop
    used an exclusive upper bound and therefore missed it.

    Results are ordered by position, then by length.
    """
    sites = []
    for start in range(len(dna) - min_len + 1):
        longest = min(max_len, len(dna) - start)
        for length in range(min_len, longest + 1, 2):
            if _is_reverse_palindrome(dna[start:start + length]):
                sites.append((start + 1, length))
    return sites


for position, length in find_reverse_palindromes(string):
    print (position, length)

22. RNA Splicing(RNA 剪切)

image.png
  • 解题思路
    • 用第二条、第三条序列(内含子)与第一条序列匹配,把匹配到的字符串替换为空,再翻译剩下的外显子
def translate_rna(sequence):
    """Translate a coding sequence into a protein string.

    The sequence is read in consecutive 3-base codons starting at index 0.
    Despite the function name the codon table is keyed by DNA codons (T),
    so RNA input (U) is converted to T before lookup.

    Translation ends at the first stop codon (TAA, TAG, TGA). Codons that
    are not in the table, including a trailing partial codon, are skipped.

    Returns the protein as a string of one-letter amino-acid codes.
    """
    codonTable = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'', 'TAG':'',
    'TGC':'C', 'TGT':'C', 'TGA':'', 'TGG':'W',
    }
    # The table only knows DNA codons; map RNA's U onto T.
    sequence = sequence.replace('U', 'T')

    residues = []
    for n in range(0, len(sequence) - 2, 3):
        amino = codonTable.get(sequence[n:n+3])
        if amino is None:
            # Unknown codon (e.g. an ambiguous base): skipped, as before.
            continue
        if amino == '':
            # Stop codons are the entries mapped to '': the protein ends here.
            break
        residues.append(amino)
    return ''.join(residues)


# Read the FASTA file into seq_list: one string per record, in file order.
# Header lines (starting with '>') only mark record boundaries; their text is
# not kept.
seq_list = []
stseq = ''
# The context manager closes the file even if parsing fails part-way.
with open('22_RNA_splicing.txt') as fasta_file:
    for line in fasta_file:
        if line.startswith('>'):
            # A new header ends the record collected so far (if any).
            if stseq != '':
                seq_list.append(stseq)
                stseq = ''
        else:
            # strip() also removes '\r' from CRLF files, which strip('\n')
            # left inside the sequence.
            stseq = stseq + line.strip()
# The last record has no following header to flush it.
seq_list.append(stseq)
print(seq_list)

# The first record is the full strand; every later record is an intron.
# Cut each intron out (replace it with the empty string), store the result
# back in seq_list[0], then translate what is left.
spliced = seq_list[0]
for intron in seq_list[1:]:
    spliced = spliced.replace(intron, '')
seq_list[0] = spliced

print (translate_rna(spliced))
image.png

23. Enumerating k-mers Lexicographically(枚举k-mers按字典顺序排列)

image.png
  • 解题思路
    • 用itertools.product函数枚举全部k-mers(permutations不允许同一字符重复出现,不能单独得到全部结果)
import itertools


def enumerate_kmers(alphabet, k):
    """Return every length-``k`` string over ``alphabet`` in lexicographic order.

    ``itertools.product`` emits tuples in lexicographic order with respect to
    the order of ``alphabet``, so no sorting is needed. The earlier union of
    ``permutations`` and ``combinations_with_replacement`` went through a set
    (arbitrary order) and for k > 2 missed strings such as 'ACA'.
    """
    return [''.join(letters) for letters in itertools.product(alphabet, repeat=k)]


list1 = ['A','C','G','T']
k = 2
for one in enumerate_kmers(list1, k):
    print(one)

GG
AA
AC
AT
CT
AG
CA
GA
TG
GC
TC
CG
GT
TA
TT
CC
import itertools
list1 = ['A','C','G','T']
k = 2
# product() yields every length-k tuple over list1, in lexicographic order
# with respect to the order of list1.
b = itertools.product(list1,repeat=k)
# Join each tuple so a k-mer prints as 'AC' rather than ('A', 'C').
kmers = [''.join(letters) for letters in b]
for kmer in kmers:
    print(kmer)

24. Longest Increasing Subsequence(最长升序子序列)

image.png
  • 解题思路
    • 两个函数一个找升序,一个找降序(注意:函数名与结果相反,increase_seq 返回降序子序列,decrease_seq 返回升序子序列)
def _longest_monotone_subsequence(a, in_order):
    """Return a longest subsequence of ``a`` in which each neighbouring pair
    ``(x, y)`` satisfies ``in_order(x, y)``.

    ``in_order`` must be a strict ordering test such as ``x < y`` or
    ``x > y``. Runs in O(n log n) comparisons: ``tails[k]`` holds the index
    of the best last element found so far for a run of length ``k + 1``,
    and ``previous`` links each element to its predecessor so the run can be
    rebuilt at the end.
    """
    tails = []
    previous = [None] * len(a)
    for index, value in enumerate(a):
        # Binary search for the first run that `value` cannot extend.
        lo, hi = 0, len(tails)
        while lo < hi:
            mid = (lo + hi) // 2
            if in_order(a[tails[mid]], value):
                lo = mid + 1
            else:
                hi = mid
        if lo > 0:
            previous[index] = tails[lo - 1]
        if lo == len(tails):
            tails.append(index)
        else:
            tails[lo] = index

    # Walk the predecessor links back from the end of the longest run.
    result = []
    index = tails[-1] if tails else None
    while index is not None:
        result.append(a[index])
        index = previous[index]
    result.reverse()
    return result


def increase_seq(a):
    """Return a longest strictly *decreasing* subsequence of ``a``.

    NOTE: the name is misleading but kept for existing callers; for
    ``[5, 1, 4, 2, 3]`` this returns ``[5, 4, 3]``.

    The earlier greedy version (always jump to the largest remaining value)
    could return a shorter run, e.g. ``[5, 4]`` for ``[3, 2, 1, 5, 4]``, and
    never terminated on some inputs with repeated values.
    Returns ``[]`` for empty input.
    """
    return _longest_monotone_subsequence(a, lambda x, y: x > y)


def decrease_seq(a):
    """Return a longest strictly *increasing* subsequence of ``a``.

    NOTE: the name is misleading but kept for existing callers; for
    ``[5, 1, 4, 2, 3]`` this returns ``[1, 2, 3]``.

    Replaces the earlier greedy version (always jump to the smallest
    remaining value), which could return a shorter run, e.g. ``[1]`` for
    ``[2, 3, 4, 1]``. Returns ``[]`` for empty input.
    """
    return _longest_monotone_subsequence(a, lambda x, y: x < y)

# Sample permutation; print the result of increase_seq, then of decrease_seq.
a = [5, 1, 4, 2, 3]
for finder in (increase_seq, decrease_seq):
    print(finder(a))

[5, 4, 3]
[1, 2, 3]

25. Genome Assembly as Shortest Superstring(寻找最短的超字符串)

image.png
  • 解题思路
    • 两两之间比较,找最大的重叠
# Read the FASTA file into seq_list: one string per record, in file order.
# Header lines (starting with '>') only mark record boundaries; their text is
# not kept.
seq_list = []
stseq = ''
# The context manager closes the file even if parsing fails part-way.
with open('25_Genome_Assembly_as_Shortest_Superstring') as fasta_file:
    for line in fasta_file:
        if line.startswith('>'):
            # A new header ends the record collected so far (if any).
            if stseq != '':
                seq_list.append(stseq)
                stseq = ''
        else:
            # strip() also removes '\r' from CRLF files, which strip('\n')
            # left inside the sequence.
            stseq = stseq + line.strip()
# The last record has no following header to flush it.
seq_list.append(stseq)


def MergeMaxOverlap(str_list):
    """Merge the two strings of ``str_list`` that overlap the most.

    For every ordered pair of distinct entries the overlap is the longest
    suffix of the first string that is also a prefix of the second. The pair
    with the largest overlap (the first such pair on ties) is joined into one
    string with the shared part written once.

    Returns a new list: all other entries in their original order, followed
    by the merged string. A list with fewer than two entries is returned as
    an unchanged copy (the earlier version raised UnboundLocalError).
    """
    if len(str_list) < 2:
        return list(str_list)

    max_length = -1
    max_indicies = None
    for prefix_index, prefix in enumerate(str_list):
        for suffix_index, suffix in enumerate(str_list):
            # Never match a string against itself.
            if suffix_index == prefix_index:
                continue
            # Drop leading characters of `prefix` until what remains is a
            # prefix of `suffix`; the empty remainder always matches, so the
            # loop terminates.
            i = 0
            while not suffix.startswith(prefix[i:]):
                i += 1
            overlap = len(prefix) - i
            # Strictly greater: on ties the first pair found is kept.
            if overlap > max_length:
                max_length = overlap
                max_indicies = [prefix_index, suffix_index]

    merged = str_list[max_indicies[0]] + str_list[max_indicies[1]][max_length:]
    untouched = [item for j, item in enumerate(str_list) if j not in max_indicies]
    return untouched + [merged]


def LongestCommonSuperstring(str_list):
    """Collapse ``str_list`` into one string by repeated MergeMaxOverlap calls.

    The list is passed through MergeMaxOverlap until at most one entry is
    left; that entry is returned.
    """
    remaining = str_list
    while True:
        if len(remaining) <= 1:
            return remaining[0]
        remaining = MergeMaxOverlap(remaining)


if __name__ == '__main__':
    # Hand a shallow copy of the parsed reads to the assembler and print
    # the assembled string.
    dna_list = list(seq_list)
    super_string = LongestCommonSuperstring(dna_list)
    print (super_string)

ATTAGACCTGCCGGAATAC
image.png