Module transdecode
[hide private]
[frames | no frames]

Source Code for Module transdecode

  1  # $Id: transdecode.py 143 2006-08-02 19:34:34Z jtk $ 
  2   
  3  # $Log$ 
  4  # Revision 1.2  2005/06/15 10:56:12  jtk 
  5  # changed gene structure, introduced regexp use for start/end finding 
  6  # 
  7  # Revision 1.1  2005/06/14 18:35:42  jtk 
  8  # transdecode for decoding transsys programs from "DNA" sequences 
  9  # 
 10   
 11  import re 
 12   
 13  import transsys 
 14  import transsys.utils 
 15   
 16   
def baseToInt(b):
    """Map a single DNA base to its integer code.

    The code is the index of the base in the string 'acgt', i.e.
    a = 0, c = 1, g = 2 and t = 3. Only lower case bases are recognised.

    @param b: a string consisting of exactly one base character
    @return: the integer code, a value in {0, 1, 2, 3}
    @raise ValueError: if b is not exactly one of 'a', 'c', 'g', 't'
    """
    # str.find performs a substring search, so without the length check the
    # empty string would map to 0 and e.g. 'cg' would map to 1.
    if len(b) == 1:
        i = 'acgt'.find(b)
        if i != -1:
            return i
    # ValueError is a subclass of StandardError on Python 2, so handlers
    # written for the exception raised previously keep working there.
    raise ValueError('bad base "%s"' % b)
22 23
def normalisedColumn(c):
    """Scale a column of numbers so that its elements sum to 1.

    @param c: a sequence of numbers with a non-zero sum
    @return: a new list holding each element of c divided by the sum of c
    @raise ZeroDivisionError: if the elements of c sum to zero
    """
    s = float(sum(c))
    # Build a real list: callers index the column and take max() of it
    # repeatedly, which a one-shot map iterator would not support.
    return [x / s for x in c]
27 28
class BindingSite:
    """Records that a protein binds at some position of a sequence.

    Instances carry the binding protein (an object providing C{name} and
    C{matrix} attributes), the position of the site within the sequence
    and the strength of binding.
    """

    def __init__(self, protein, position, strength):
        """Store the binding protein, the site position and the strength."""
        self.protein, self.position, self.strength = protein, position, strength

    def arrowLine(self):
        """Return a text line marking this site below a printed sequence.

        The line consists of one blank per position preceding the site,
        one caret per column of the protein's matrix, and a label giving
        the protein name, the position and the binding strength.
        """
        indent = ' ' * self.position
        marker = '^' * len(self.protein.matrix)
        label = ' %s @ %d (%4.2f)' % (self.protein.name, self.position, self.strength)
        return indent + marker + label
39 40
class DNABinder:
    """Represents a DNA binding factor using a scoring matrix and a threshold."""

    def __init__(self, name, dna, thresholdBase, thresholdIncrement):
        """Create a binder called name by decoding the sequence dna.

        See L{decode} for the meaning of the remaining parameters.
        """
        self.name = name
        self.decode(dna, thresholdBase, thresholdIncrement)

    def decode(self, dna, thresholdBase, thresholdIncrement):
        """Decode a string of dna bases to a matrix and a threshold.

        The matrix is decoded by taking words of length 5 from the encoding
        sequence. If the first four characters are all identical, decoding
        stops. Otherwise, they are mapped to integer values in {0, 1, 2, 3},
        normalised to sum to 1 and appended to the matrix as a further
        column. The 5th character is mapped to an integer, which is
        multiplied by thresholdIncrement and added to the threshold. The
        threshold starts out at thresholdBase.

        Sets the attributes threshold, matrix and encoding_sequence, the
        latter being the prefix of dna that was consumed (including the
        four characters of a terminating word, if one was found).

        @param dna: the encoding sequence
        @param thresholdBase: initial value of the threshold
        @param thresholdIncrement: weight of each word's 5th character
        """
        self.threshold = thresholdBase
        self.matrix = []
        i = 0
        while len(dna) - i >= 5:
            # a list (rather than a map object) so the codes can be indexed
            c = [baseToInt(b) for b in dna[i:i + 4]]
            if c[0] == c[1] and c[0] == c[2] and c[0] == c[3]:
                # terminating word: only its four equal bases are consumed
                i = i + 4
                break
            self.matrix.append(normalisedColumn(c))
            self.threshold = self.threshold + baseToInt(dna[i + 4]) * thresholdIncrement
            i = i + 5
        self.encoding_sequence = dna[:i]

    def __str__(self):
        """Return a multi-line description: header, one matrix row per base, encoding."""
        rows = []
        for label, k in (('a ', 0), ('c ', 1), ('g ', 2), ('t ', 3)):
            rows.append(label + ''.join(['%5.2f' % c[k] for c in self.matrix]))
        return 'DNABinder %s, threshold = %5.1f, max. score = %5.1f\n%s\n%s\n%s\n%s\nencoded by: %s' % (self.name, self.threshold, self.max_score(), rows[0], rows[1], rows[2], rows[3], self.encoding_sequence)

    def max_score(self):
        """Return the highest score attainable, i.e. the sum of the column maxima."""
        s = 0.0
        for c in self.matrix:
            s = s + max(c)
        return s

    def bindingEnergy(self, seq):
        """Score the start of seq against the matrix.

        @return: the sum over all matrix columns of the entry selected by
          the corresponding base of seq, or 0.0 if seq is shorter than
          the matrix
        """
        if len(seq) < len(self.matrix):
            return 0.0
        e = 0.0
        # seq is at least as long as the matrix here, so zip pairs every
        # column with the base at the same offset.
        for column, base in zip(self.matrix, seq):
            e = e + column[baseToInt(base)]
        return e

    def bindingSite(self, seq, position = 0):
        """Check whether this binder binds to seq at the given position.

        @return: a BindingSite if the matrix is non-empty and the binding
          energy reaches the threshold, None otherwise. The strength of the
          site is the energy divided by the threshold, or 1.0 if the
          threshold is not positive.
        """
        if len(self.matrix) == 0:
            return None
        e = self.bindingEnergy(seq[position:])
        if e < self.threshold:
            return None
        if self.threshold > 0.0:
            strength = e / self.threshold
        else:
            strength = 1.0
        return BindingSite(self, position, strength)
115 116
def find_binding_sites(proteome, seq):
    """Collect the binding sites of all proteins of a proteome on a sequence.

    Every protein's bindingSite method is called for each scanned position;
    results other than None are gathered in scanning order (by position,
    then by order of the proteins in proteome).

    @param proteome: iterable of objects providing bindingSite(seq, position);
      it is iterated once per scanned position
    @param seq: the sequence to scan
    @return: list of the binding sites found
    """
    blist = []
    # NOTE(review): the scan covers positions 0 .. len(seq) - 2 only, so the
    # last base is never used as a start position. Kept as is because
    # changing it would alter decoding results -- confirm whether intended.
    for i in range(len(seq) - 1):
        for p in proteome:
            bs = p.bindingSite(seq, i)
            if bs is not None:
                blist.append(bs)
    return blist
125 126
class RawDNAGene:
    """Holds the sequences of the elements that make up one gene.

    The elements are the gene start (promoter), the activator binding area,
    the repressor binding area, the structural area, and the gene end
    (transcriptional terminator). The gene is 'raw' in that only the
    sequences are kept, not any data structures derived from them, such as
    DNABinder etc.
    """

    def __init__(self, position, gene_name, product_name, geneStart, geneEnd, activatorArea, repressorArea, structuralArea):
        """Store the position, the names and the element sequences of the gene."""
        self.position = position
        self.gene_name, self.product_name = gene_name, product_name
        self.geneStart, self.geneEnd = geneStart, geneEnd
        self.activatorArea, self.repressorArea = activatorArea, repressorArea
        self.structuralArea = structuralArea

    def promoterArea(self):
        """Return the activator area followed by the repressor area."""
        upstream = self.activatorArea
        downstream = self.repressorArea
        return upstream + downstream

    def rawSequence(self):
        """Return the gene's sequence: start, promoter area, structural area, end."""
        head = self.geneStart + self.promoterArea()
        tail = self.structuralArea + self.geneEnd
        return head + tail
150 151
class TranssysDNADecoder:
    """Decodes genome sequences into transsys programs.

    The overall gene structure, as extracted by L{rawDNAGenes}, is::

      ...|-geneStart-|----activator area-----|----repressor area-----|--structural area--|-geneEnd-|...
                     |<-activatorAreaLength->|<-repressorAreaLength->|

    The structural area length is variable. Gene start and end can also be
    variable, as enabled by regular expressions.

    This decoder uses the following constants:
      - thresholdBase, thresholdIncrement: constants for DNABinder decoding
      - factorNameTemplate, geneNameTemplate: format strings that are
        applied to a gene's position to name its product and the gene
      - geneStartRE, geneEndRE: Regular expressions to determine start and
        end of a gene
      - repressorAreaLength: length of area where binding represses
      - activatorAreaLength: length of area where binding activates
      - decay, diffusibility: constants for factor construction
      - a_spec, a_max: constants for promoter element construction
        (used for both activate and repress elements)
      - constitutive: constant for constitutive promoter element construction

    Notice that strength of activation / repression does not depend on
    binding strength in this decoder.
    """

    savefile_magic = 'TranssysDNADecoderParameters-1.3'

    def __init__(self):
        """Create a decoder with all parameters unset (None)."""
        self.thresholdBase = None
        # was misspelled, which left thresholdIncrement uninitialised
        self.thresholdIncrement = None
        self.factorNameTemplate = None
        self.geneNameTemplate = None
        self.geneStartRE = None
        self.geneEndRE = None
        self.repressorAreaLength = None
        self.activatorAreaLength = None
        self.decay = None
        self.diffusibility = None
        self.a_spec = None
        self.a_max = None
        self.constitutive = None

    def __str__(self):
        """Return the parameters as 'name: value' lines, in the order read by parse.

        All parameters must have been set; the result has no trailing newline.
        """
        lines = [
            'thresholdBase: %g' % self.thresholdBase,
            'thresholdIncrement: %g' % self.thresholdIncrement,
            'factorNameTemplate: %s' % self.factorNameTemplate,
            'geneNameTemplate: %s' % self.geneNameTemplate,
            'geneStartRE: %s' % self.geneStartRE.pattern,
            'geneEndRE: %s' % self.geneEndRE.pattern,
            'repressorAreaLength: %d' % self.repressorAreaLength,
            'activatorAreaLength: %d' % self.activatorAreaLength,
            'decay: %g' % self.decay,
            'diffusibility: %g' % self.diffusibility,
            'a_spec: %g' % self.a_spec,
            'a_max: %g' % self.a_max,
            'constitutive: %g' % self.constitutive,
        ]
        return '\n'.join(lines)

    def write(self, f):
        """Write the magic line followed by the parameters to the file f."""
        f.write('%s\n' % self.savefile_magic)
        f.write(str(self))
        f.write('\n')

    def parse(self, f):
        """Read the parameters from the file f.

        The first line must be the savefile magic; the parameters follow in
        the order in which __str__ emits them.

        @raise ValueError: if the first line is not the expected magic
        """
        line = f.readline()
        if line.strip() != self.savefile_magic:
            # ValueError is a subclass of StandardError on Python 2, so
            # handlers for the exception raised previously still apply.
            raise ValueError('TranssysDNADecode::parse: bad magic "%s"' % line.strip())
        self.thresholdBase = transsys.utils.parse_float(f, 'thresholdBase')
        self.thresholdIncrement = transsys.utils.parse_float(f, 'thresholdIncrement')
        self.factorNameTemplate = transsys.utils.parse_string(f, 'factorNameTemplate').strip()
        self.geneNameTemplate = transsys.utils.parse_string(f, 'geneNameTemplate').strip()
        self.setGeneStartRE(transsys.utils.parse_string(f, 'geneStartRE').strip())
        self.setGeneEndRE(transsys.utils.parse_string(f, 'geneEndRE').strip())
        self.repressorAreaLength = transsys.utils.parse_int(f, 'repressorAreaLength')
        self.activatorAreaLength = transsys.utils.parse_int(f, 'activatorAreaLength')
        self.decay = transsys.utils.parse_float(f, 'decay')
        self.diffusibility = transsys.utils.parse_float(f, 'diffusibility')
        self.a_spec = transsys.utils.parse_float(f, 'a_spec')
        self.a_max = transsys.utils.parse_float(f, 'a_max')
        self.constitutive = transsys.utils.parse_float(f, 'constitutive')

    def setGeneStartRE(self, r):
        """Compile r and use it as the gene start regular expression."""
        self.geneStartRE = re.compile(r)

    def setGeneEndRE(self, r):
        """Compile r and use it as the gene end regular expression."""
        self.geneEndRE = re.compile(r)

    def rawDNAGenes(self, genome):
        """separate a genome into raw genes.

        Each match of the gene start expression yields one gene. The
        activator and repressor areas follow the start match directly; the
        structural area extends from there to the next match of the gene
        end expression, or to the end of the genome if there is none (the
        gene end is then recorded as '*').

        @return: list of RawDNAGene instances, ordered by position
        """
        raw_gene_list = []
        start_match = self.geneStartRE.search(genome)
        while start_match:
            position = start_match.start()
            geneStart = start_match.group()
            a_start = position + len(geneStart)
            a_end = a_start + self.activatorAreaLength
            r_start = a_end
            r_end = r_start + self.repressorAreaLength
            s_start = r_end
            end_match = self.geneEndRE.search(genome, s_start)
            if end_match:
                s_end = end_match.start()
                geneEnd = end_match.group()
            else:
                s_end = len(genome)
                geneEnd = '*'
            activatorArea = genome[a_start:a_end]
            repressorArea = genome[r_start:r_end]
            structuralArea = genome[s_start:s_end]
            gene_name = self.geneNameTemplate % position
            product_name = self.factorNameTemplate % position
            raw_gene = RawDNAGene(position, gene_name, product_name, geneStart, geneEnd, activatorArea, repressorArea, structuralArea)
            raw_gene_list.append(raw_gene)
            # resume one base after this start, so genes may overlap
            start_match = self.geneStartRE.search(genome, position + 1)
        return raw_gene_list

    def decode_transsys(self, transsys_name, genome):
        """construct a transsys program by decoding a genome sequence.

        The process is: (1) split the genome into sequence portions that
        represent genes, based on the gene start and end regular
        expressions. (2) For each gene, translate the structural parts of
        a gene into a DNABinder; the set of all DNABinders constitutes the
        proteome. (3) For each gene, determine the multiset of DNABinders
        that bind in the activating and the repressing regions,
        respectively. (4) Construct a transsys program of the genes and
        the proteome found, with promoters constructed based on (3).

        @param transsys_name: name of the transsys program to construct
        @param genome: the genome sequence to decode
        @return: the transsys.TranssysProgram constructed
        """
        raw_gene_list = self.rawDNAGenes(genome)
        proteome = []
        factor_list = []
        for rg in raw_gene_list:
            p = DNABinder(rg.product_name, rg.structuralArea, self.thresholdBase, self.thresholdIncrement)
            proteome.append(p)
            decay_expr = transsys.ExpressionNodeValue(self.decay)
            diffusibility_expr = transsys.ExpressionNodeValue(self.diffusibility)
            factor = transsys.Factor(rg.product_name, decay_expr, diffusibility_expr)
            factor.comments.append('decoded to DNABinder:')
            for l in str(p).split('\n'):
                factor.comments.append(l)
            factor_list.append(factor)
        gene_list = []
        for rg in raw_gene_list:
            activators = find_binding_sites(proteome, rg.activatorArea)
            repressors = find_binding_sites(proteome, rg.repressorArea)
            promoter = [transsys.PromoterElementConstitutive(transsys.ExpressionNodeValue(self.constitutive))]
            # every promoter element gets expression nodes of its own
            for a in activators:
                a_spec = transsys.ExpressionNodeValue(self.a_spec)
                a_max = transsys.ExpressionNodeValue(self.a_max)
                promoter.append(transsys.PromoterElementActivate(a_spec, a_max, [a.protein.name]))
            for r in repressors:
                r_spec = transsys.ExpressionNodeValue(self.a_spec)
                r_max = transsys.ExpressionNodeValue(self.a_max)
                promoter.append(transsys.PromoterElementRepress(r_spec, r_max, [r.protein.name]))
            gene = transsys.Gene(rg.gene_name, rg.product_name, promoter)
            # the areas are indented so that they line up with their
            # location within the raw gene sequence
            gene.comments.append('gene: %s' % rg.rawSequence())
            gene.comments.append('promoter area: %s' % (' ' * len(rg.geneStart)) + rg.promoterArea())
            gene.comments.append('structural area: %s' % (' ' * (len(rg.geneStart) + len(rg.promoterArea())) + rg.structuralArea))
            gene.comments.append('activator area:')
            gene.comments.append(rg.activatorArea)
            for a in activators:
                gene.comments.append(a.arrowLine())
            gene.comments.append('repressor area:')
            gene.comments.append(rg.repressorArea)
            for r in repressors:
                gene.comments.append(r.arrowLine())
            gene_list.append(gene)
        return transsys.TranssysProgram(transsys_name, factor_list, gene_list, resolve = True)
325