序列比对问题

输入两个字符串, X        = x_{1} x_{2}... x_{m}        Y        = y_{1} y_{2}... y_{m}        ,逐字输出字符串的对齐方式,这样就可以减少净惩罚 最小化 .罚款计算如下: 1.罚款 p_{gap}        在字符串之间插入间隙时发生。 2.罚款 p_{xy}        由于与的字符不匹配而发生 X        Y        .

null

例如:

Input : X = CG, Y = CA, p_gap = 3, p_xy = 7Output : X = CG_, Y = C_A, Total penalty = 6Input : X = AGGGCT, Y = AGGCA, p_gap = 3, p_xy = 2Output : X = AGGGCT, Y = A_GGCA, Total penalty = 5Input : X = CG, Y = CA, p_gap = 3, p_xy = 5Output : X = CG, Y = CA, Total penalty = 5

关于这个问题历史的简要说明 序列比对问题是生物科学的基本问题之一,旨在寻找两个氨基酸序列的相似性。比较氨基酸对人类至关重要,因为它提供了进化和发展的重要信息。Saul B.Needeman和Christian D.Wunsch为这个问题设计了一种动态规划算法,并于1970年发表。从那时起,为了提高时间复杂度和空间复杂度,已经做了许多改进,但这些都超出了本文讨论的范围。

解决方案 我们可以用动态规划来解决这个问题。可行的解决方案是在弦中引入间隙,以便使长度相等。因为可以很容易地证明,在长度相等后增加额外的间隙只会导致惩罚的增加。

最优子结构 从最优解(例如,从给定的样本输入)可以观察到,最优解仅限于 候选人。 1. x_{m}        y_{n}        . 2. x_{m}        还有缺口。 3.差距和挑战 y_{n}        . 最优子结构的证明。 我们很容易用矛盾来证明。允许 X - x_{m}        X^'        Y - y_{n}        Y^'        .假设 X^'        , Y^'        有一些惩罚 P        ,而竞争对手将受到处罚 P^*        具有 P^* < P        . 现在,附加 x_{m}        y_{n}        ,我们得到了与点球一致的结果 P^* + p_{xy} < P + p_{xy}        .这与原始路线的最佳性相矛盾 X, Y        . 因此,事实证明了这一点。 允许 dp[i][j]        是最佳排列的惩罚吗 X_{i}        Y_{i}        .然后,从最优子结构, dp[i][j] = min(dp[i-1][j-1] + p_{xy}, dp[i-1][j] + p_{gap}, dp[i][j-1] + p_{gap})        . 因此,最低处罚总额为:, dp[m][n]        .

重建解决方案 重建, 1.追溯已填写的表格,从 dp[m][n]        . 2.什么时候 (i, j)        …..2a。如果使用案例1填充,请转到 (i-1, j-1)        . …..2b。如果使用案例2填充,请转到 (i-1, j)        . …..2c。如果使用案例3填充,请转到 (i, j-1)        . 3.如果i=0或j=0,则将剩余子串与间隙匹配。

下面是上述解决方案的实现。

C++

// CPP program to implement sequence alignment
// problem.
#include <bits/stdc++.h>
using namespace std;
// function to find out the minimum penalty
void getMinimumPenalty(string x, string y, int pxy, int pgap)
{
int i, j; // initialising variables
int m = x.length(); // length of gene1
int n = y.length(); // length of gene2
// table for storing optimal substructure answers
int dp[m+1][n+1] = {0};
// initialising the table
for (i = 0; i <= (n+m); i++)
{
dp[i][0] = i * pgap;
dp[0][i] = i * pgap;
}
// calculating the minimum penalty
for (i = 1; i <= m; i++)
{
for (j = 1; j <= n; j++)
{
if (x[i - 1] == y[j - 1])
{
dp[i][j] = dp[i - 1][j - 1];
}
else
{
dp[i][j] = min({dp[i - 1][j - 1] + pxy ,
dp[i - 1][j] + pgap    ,
dp[i][j - 1] + pgap    });
}
}
}
// Reconstructing the solution
int l = n + m; // maximum possible length
i = m; j = n;
int xpos = l;
int ypos = l;
// Final answers for the respective strings
int xans[l+1], yans[l+1];
while ( !(i == 0 || j == 0))
{
if (x[i - 1] == y[j - 1])
{
xans[xpos--] = ( int )x[i - 1];
yans[ypos--] = ( int )y[j - 1];
i--; j--;
}
else if (dp[i - 1][j - 1] + pxy == dp[i][j])
{
xans[xpos--] = ( int )x[i - 1];
yans[ypos--] = ( int )y[j - 1];
i--; j--;
}
else if (dp[i - 1][j] + pgap == dp[i][j])
{
xans[xpos--] = ( int )x[i - 1];
yans[ypos--] = ( int ) '_' ;
i--;
}
else if (dp[i][j - 1] + pgap == dp[i][j])
{
xans[xpos--] = ( int ) '_' ;
yans[ypos--] = ( int )y[j - 1];
j--;
}
}
while (xpos > 0)
{
if (i > 0) xans[xpos--] = ( int )x[--i];
else xans[xpos--] = ( int ) '_' ;
}
while (ypos > 0)
{
if (j > 0) yans[ypos--] = ( int )y[--j];
else yans[ypos--] = ( int ) '_' ;
}
// Since we have assumed the answer to be n+m long,
// we need to remove the extra gaps in the starting
// id represents the index from which the arrays
// xans, yans are useful
int id = 1;
for (i = l; i >= 1; i--)
{
if (( char )yans[i] == '_' && ( char )xans[i] == '_' )
{
id = i + 1;
break ;
}
}
// Printing the final answer
cout << "Minimum Penalty in aligning the genes = " ;
cout << dp[m][n] << "" ;
cout << "The aligned genes are :" ;
for (i = id; i <= l; i++)
{
cout<<( char )xans[i];
}
cout << "" ;
for (i = id; i <= l; i++)
{
cout << ( char )yans[i];
}
return ;
}
// Driver code
int main(){
// input strings
string gene1 = "AGGGCT" ;
string gene2 = "AGGCA" ;
// initialising penalties of different types
int misMatchPenalty = 3;
int gapPenalty = 2;
// calling the function to calculate the result
getMinimumPenalty(gene1, gene2,
misMatchPenalty, gapPenalty);
return 0;
}


JAVA

// Java program to implement
// sequence alignment problem.
import java.io.*;
import java.util.*;
import java.lang.*;
class GFG
{
// function to find out
// the minimum penalty
static void getMinimumPenalty(String x, String y,
int pxy, int pgap)
{
int i, j; // initialising variables
int m = x.length(); // length of gene1
int n = y.length(); // length of gene2
// table for storing optimal
// substructure answers
int dp[][] = new int [n + m + 1 ][n + m + 1 ];
for ( int [] x1 : dp)
Arrays.fill(x1, 0 );
// initialising the table
for (i = 0 ; i <= (n + m); i++)
{
dp[i][ 0 ] = i * pgap;
dp[ 0 ][i] = i * pgap;
}
// calculating the
// minimum penalty
for (i = 1 ; i <= m; i++)
{
for (j = 1 ; j <= n; j++)
{
if (x.charAt(i - 1 ) == y.charAt(j - 1 ))
{
dp[i][j] = dp[i - 1 ][j - 1 ];
}
else
{
dp[i][j] = Math.min(Math.min(dp[i - 1 ][j - 1 ] + pxy ,
dp[i - 1 ][j] + pgap) ,
dp[i][j - 1 ] + pgap );
}
}
}
// Reconstructing the solution
int l = n + m; // maximum possible length
i = m; j = n;
int xpos = l;
int ypos = l;
// Final answers for
// the respective strings
int xans[] = new int [l + 1 ];
int yans[] = new int [l + 1 ];
while ( !(i == 0 || j == 0 ))
{
if (x.charAt(i - 1 ) == y.charAt(j - 1 ))
{
xans[xpos--] = ( int )x.charAt(i - 1 );
yans[ypos--] = ( int )y.charAt(j - 1 );
i--; j--;
}
else if (dp[i - 1 ][j - 1 ] + pxy == dp[i][j])
{
xans[xpos--] = ( int )x.charAt(i - 1 );
yans[ypos--] = ( int )y.charAt(j - 1 );
i--; j--;
}
else if (dp[i - 1 ][j] + pgap == dp[i][j])
{
xans[xpos--] = ( int )x.charAt(i - 1 );
yans[ypos--] = ( int ) '_' ;
i--;
}
else if (dp[i][j - 1 ] + pgap == dp[i][j])
{
xans[xpos--] = ( int ) '_' ;
yans[ypos--] = ( int )y.charAt(j - 1 );
j--;
}
}
while (xpos > 0 )
{
if (i > 0 ) xans[xpos--] = ( int )x.charAt(--i);
else xans[xpos--] = ( int ) '_' ;
}
while (ypos > 0 )
{
if (j > 0 ) yans[ypos--] = ( int )y.charAt(--j);
else yans[ypos--] = ( int ) '_' ;
}
// Since we have assumed the
// answer to be n+m long,
// we need to remove the extra
// gaps in the starting id
// represents the index from
// which the arrays xans,
// yans are useful
int id = 1 ;
for (i = l; i >= 1 ; i--)
{
if (( char )yans[i] == '_' &&
( char )xans[i] == '_' )
{
id = i + 1 ;
break ;
}
}
// Printing the final answer
System.out.print( "Minimum Penalty in " +
"aligning the genes = " );
System.out.print(dp[m][n] + "" );
System.out.println( "The aligned genes are :" );
for (i = id; i <= l; i++)
{
System.out.print(( char )xans[i]);
}
System.out.print( "" );
for (i = id; i <= l; i++)
{
System.out.print(( char )yans[i]);
}
return ;
}
// Driver code
public static void main(String[] args)
{
// input strings
String gene1 = "AGGGCT" ;
String gene2 = "AGGCA" ;
// initialising penalties
// of different types
int misMatchPenalty = 3 ;
int gapPenalty = 2 ;
// calling the function to
// calculate the result
getMinimumPenalty(gene1, gene2,
misMatchPenalty, gapPenalty);
}
}


Python3

#!/bin/python3
"""
Converted from C++ solution to Python3
Algorithm type / application: Bioinformatics
Python Requirements:
numpy
"""
import numpy as np
def get_minimum_penalty(x: str , y: str , pxy: int , pgap: int ):
"""
Function to find out the minimum penalty
:param x: pattern X
:param y: pattern Y
:param pxy: penalty of mis-matching the characters of X and Y
:param pgap: penalty of a gap between pattern elements
"""
# initializing variables
i = 0
j = 0
# pattern lengths
m = len (x)
n = len (y)
# table for storing optimal substructure answers
dp = np.zeros([m + 1 ,n + 1 ], dtype = int ) #int dp[m+1][n+1] = {0};
# initialising the table
dp[ 0 :(m + 1 ), 0 ] = [ i * pgap for i in range (m + 1 )]
dp[ 0 , 0 :(n + 1 )] = [ i * pgap for i in range (n + 1 )]
# calculating the minimum penalty
i = 1
while i < = m:
j = 1
while j < = n:
if x[i - 1 ] = = y[j - 1 ]:
dp[i][j] = dp[i - 1 ][j - 1 ]
else :
dp[i][j] = min (dp[i - 1 ][j - 1 ] + pxy,
dp[i - 1 ][j] + pgap,
dp[i][j - 1 ] + pgap)
j + = 1
i + = 1
# Reconstructing the solution
l = n + m # maximum possible length
i = m
j = n
xpos = l
ypos = l
# Final answers for the respective strings
xans = np.zeros(l + 1 , dtype = int )
yans = np.zeros(l + 1 , dtype = int )
while not (i = = 0 or j = = 0 ):
#print(f"i: {i}, j: {j}")
if x[i - 1 ] = = y[j - 1 ]:
xans[xpos] = ord (x[i - 1 ])
yans[ypos] = ord (y[j - 1 ])
xpos - = 1
ypos - = 1
i - = 1
j - = 1
elif (dp[i - 1 ][j - 1 ] + pxy) = = dp[i][j]:
xans[xpos] = ord (x[i - 1 ])
yans[ypos] = ord (y[j - 1 ])
xpos - = 1
ypos - = 1
i - = 1
j - = 1
elif (dp[i - 1 ][j] + pgap) = = dp[i][j]:
xans[xpos] = ord (x[i - 1 ])
yans[ypos] = ord ( '_' )
xpos - = 1
ypos - = 1
i - = 1
elif (dp[i][j - 1 ] + pgap) = = dp[i][j]:
xans[xpos] = ord ( '_' )
yans[ypos] = ord (y[j - 1 ])
xpos - = 1
ypos - = 1
j - = 1
while xpos > 0 :
if i > 0 :
i - = 1
xans[xpos] = ord (x[i])
xpos - = 1
else :
xans[xpos] = ord ( '_' )
xpos - = 1
while ypos > 0 :
if j > 0 :
j - = 1
yans[ypos] = ord (y[j])
ypos - = 1
else :
yans[ypos] = ord ( '_' )
ypos - = 1
# Since we have assumed the answer to be n+m long,
# we need to remove the extra gaps in the starting
# id represents the index from which the arrays
# xans, yans are useful
id = 1
i = l
while i > = 1 :
if ( chr (yans[i]) = = '_' ) and chr (xans[i]) = = '_' :
id = i + 1
break
i - = 1
# Printing the final answer
print (f "Minimum Penalty in aligning the genes = {dp[m][n]}" )
print ( "The aligned genes are:" )
# X
i = id
x_seq = ""
while i < = l:
x_seq + = chr (xans[i])
i + = 1
print (f "X seq: {x_seq}" )
# Y
i = id
y_seq = ""
while i < = l:
y_seq + = chr (yans[i])
i + = 1
print (f "Y seq: {y_seq}" )
def test_get_minimum_penalty():
"""
Test the get_minimum_penalty function
"""
# input strings
gene1 = "AGGGCT"
gene2 = "AGGCA"
# initialising penalties of different types
mismatch_penalty = 3
gap_penalty = 2
# calling the function to calculate the result
get_minimum_penalty(gene1, gene2, mismatch_penalty, gap_penalty)
test_get_minimum_penalty()
# This code is contributed by wilderchirstopher.


C#

// C# program to implement sequence alignment
// problem.
using System;
class GFG
{
// function to find out the minimum penalty
public static void getMinimumPenalty( string x, string y, int pxy, int pgap)
{
int i, j; // initialising variables
int m = x.Length; // length of gene1
int n = y.Length; // length of gene2
// table for storing optimal substructure answers
int [,] dp = new int [n+m+1,n+m+1];
for ( int q = 0; q < n+m+1; q++)
for ( int w = 0; w < n+m+1; w++)
dp[q,w] = 0;
// initialising the table
for (i = 0; i <= (n+m); i++)
{
dp[i,0] = i * pgap;
dp[0,i] = i * pgap;
}
// calculating the minimum penalty
for (i = 1; i <= m; i++)
{
for (j = 1; j <= n; j++)
{
if (x[i - 1] == y[j - 1])
{
dp[i,j] = dp[i - 1,j - 1];
}
else
{
dp[i,j] = Math.Min(Math.Min(dp[i - 1,j - 1] + pxy ,
dp[i - 1,j] + pgap)    ,
dp[i,j - 1] + pgap    );
}
}
}
// Reconstructing the solution
int l = n + m; // maximum possible length
i = m; j = n;
int xpos = l;
int ypos = l;
// Final answers for the respective strings
int [] xans = new int [l+1];
int [] yans = new int [l+1];
while ( !(i == 0 || j == 0))
{
if (x[i - 1] == y[j - 1])
{
xans[xpos--] = ( int )x[i - 1];
yans[ypos--] = ( int )y[j - 1];
i--; j--;
}
else if (dp[i - 1,j - 1] + pxy == dp[i,j])
{
xans[xpos--] = ( int )x[i - 1];
yans[ypos--] = ( int )y[j - 1];
i--; j--;
}
else if (dp[i - 1,j] + pgap == dp[i,j])
{
xans[xpos--] = ( int )x[i - 1];
yans[ypos--] = ( int ) '_' ;
i--;
}
else if (dp[i,j - 1] + pgap == dp[i,j])
{
xans[xpos--] = ( int ) '_' ;
yans[ypos--] = ( int )y[j - 1];
j--;
}
}
while (xpos > 0)
{
if (i > 0) xans[xpos--] = ( int )x[--i];
else xans[xpos--] = ( int ) '_' ;
}
while (ypos > 0)
{
if (j > 0) yans[ypos--] = ( int )y[--j];
else yans[ypos--] = ( int ) '_' ;
}
// Since we have assumed the answer to be n+m long,
// we need to remove the extra gaps in the starting
// id represents the index from which the arrays
// xans, yans are useful
int id = 1;
for (i = l; i >= 1; i--)
{
if (( char )yans[i] == '_' && ( char )xans[i] == '_' )
{
id = i + 1;
break ;
}
}
// Printing the final answer
Console.Write( "Minimum Penalty in aligning the genes = " + dp[m,n] + "" );
Console.Write( "The aligned genes are :" );
for (i = id; i <= l; i++)
{
Console.Write(( char )xans[i]);
}
Console.Write( "" );
for (i = id; i <= l; i++)
{
Console.Write(( char )yans[i]);
}
return ;
}
// Driver code
static void Main()
{
// input strings
string gene1 = "AGGGCT" ;
string gene2 = "AGGCA" ;
// initialising penalties of different types
int misMatchPenalty = 3;
int gapPenalty = 2;
// calling the function to calculate the result
getMinimumPenalty(gene1, gene2,
misMatchPenalty, gapPenalty);
}
//This code is contributed by DrRoot_
}


PHP

<?php
// PHP program to implement
// sequence alignment problem.
// function to find out
// the minimum penalty
function getMinimumPenalty( $x , $y ,
$pxy , $pgap )
{
$i ; $j ; // initializing variables
$m = strlen ( $x ); // length of gene1
$n = strlen ( $y ); // length of gene2
// table for storing optimal
// substructure answers
$dp [ $n + $m + 1][ $n + $m + 1] = array (0);
// initialising the table
for ( $i = 0; $i <= ( $n + $m ); $i ++)
{
$dp [ $i ][0] = $i * $pgap ;
$dp [0][ $i ] = $i * $pgap ;
}
// calculating the
// minimum penalty
for ( $i = 1; $i <= $m ; $i ++)
{
for ( $j = 1; $j <= $n ; $j ++)
{
if ( $x [ $i - 1] == $y [ $j - 1])
{
$dp [ $i ][ $j ] = $dp [ $i - 1][ $j - 1];
}
else
{
$dp [ $i ][ $j ] = min( $dp [ $i - 1][ $j - 1] + $pxy ,
$dp [ $i - 1][ $j ] + $pgap ,
$dp [ $i ][ $j - 1] + $pgap );
}
}
}
// Reconstructing the solution
$l = $n + $m ; // maximum possible length
$i = $m ; $j = $n ;
$xpos = $l ;
$ypos = $l ;
// Final answers for
// the respective strings
// $xans[$l + 1]; $yans[$l + 1];
while ( !( $i == 0 || $j == 0))
{
if ( $x [ $i - 1] == $y [ $j - 1])
{
$xans [ $xpos --] = $x [ $i - 1];
$yans [ $ypos --] = $y [ $j - 1];
$i --; $j --;
}
else if ( $dp [ $i - 1][ $j - 1] +
$pxy == $dp [ $i ][ $j ])
{
$xans [ $xpos --] = $x [ $i - 1];
$yans [ $ypos --] = $y [ $j - 1];
$i --; $j --;
}
else if ( $dp [ $i - 1][ $j ] +
$pgap == $dp [ $i ][ $j ])
{
$xans [ $xpos --] = $x [ $i - 1];
$yans [ $ypos --] = '_' ;
$i --;
}
else if ( $dp [ $i ][ $j - 1] +
$pgap == $dp [ $i ][ $j ])
{
$xans [ $xpos --] = '_' ;
$yans [ $ypos --] = $y [ $j - 1];
$j --;
}
}
while ( $xpos > 0)
{
if ( $i > 0) $xans [ $xpos --] = $x [-- $i ];
else $xans [ $xpos --] = '_' ;
}
while ( $ypos > 0)
{
if ( $j > 0)
$yans [ $ypos --] = $y [-- $j ];
else
$yans [ $ypos --] = '_' ;
}
// Since we have assumed the
// answer to be n+m long,
// we need to remove the extra
// gaps in the starting
// id represents the index
// from which the arrays
// xans, yans are useful
$id = 1;
for ( $i = $l ; $i >= 1; $i --)
{
if ( $yans [ $i ] == '_' &&
$xans [ $i ] == '_' )
{
$id = $i + 1;
break ;
}
}
// Printing the final answer
echo "Minimum Penalty in " .
"aligning the genes = " ;
echo $dp [ $m ][ $n ] . "" ;
echo "The aligned genes are :" ;
for ( $i = $id ; $i <= $l ; $i ++)
{
echo $xans [ $i ];
}
echo "" ;
for ( $i = $id ; $i <= $l ; $i ++)
{
echo $yans [ $i ];
}
return ;
}
// Driver code
// input strings
$gene1 = "AGGGCT" ;
$gene2 = "AGGCA" ;
// initialising penalties
// of different types
$misMatchPenalty = 3;
$gapPenalty = 2;
// calling the function
// to calculate the result
getMinimumPenalty( $gene1 , $gene2 ,
$misMatchPenalty , $gapPenalty );
// This code is contributed by Abhinav96
?>


输出:

Minimum Penalty in aligning the genes = 5The aligned genes are :AGGGCTA_GGCA

时间复杂性: mathcal{O}(n*m)        空间复杂性: mathcal{O}(n*m)

© 版权声明
THE END
喜欢就支持一下吧
点赞5 分享