1
2
3
4
5
6
7
8
9
10
11 import re
12
13 import transsys
14 import transsys.utils
15
16
def baseToInt(b):
    """Map a single DNA base character to its integer code.

    The code is the index of the base in the string 'acgt', i.e.
    a = 0, c = 1, g = 2, t = 3. Matching is case sensitive.

    Raises ValueError if b is not exactly one of these four characters.
    (On Python 2, ValueError is a subclass of StandardError, so handlers
    written for StandardError still catch it.)
    """
    # str.find() on its own would accept '' (index 0) and multi-character
    # substrings such as 'cg' (index 1), so require exactly one character.
    i = -1
    if len(b) == 1:
        i = 'acgt'.find(b)
    if i == -1:
        raise ValueError('bad base "%s"' % b)
    return i
22
23
def normalisedColumn(c):
    """Return the entries of c scaled so that they sum to 1.

    c is a sequence of numbers. The result is a new list in which each
    entry has been divided by the (float) sum of all entries. A column
    whose entries sum to zero raises ZeroDivisionError.
    """
    total = float(sum(c))
    # A list (not a lazy map object) because callers index the column.
    return [x / total for x in c]
27
28
30
31 - def __init__(self, protein, position, strength) :
32 self.protein = protein
33 self.position = position
34 self.strength = strength
35
36
def arrowLine(self):
    """Return a one-line text marker for this binding site.

    The line consists of self.position spaces, one '^' per column of the
    protein's matrix, and a label giving the protein name, the position
    and the strength.
    """
    indent = ' ' * self.position
    arrows = '^' * len(self.protein.matrix)
    label = ' %s @ %d (%4.2f)' % (self.protein.name, self.position, self.strength)
    return indent + arrows + label
39
40
42 """Represents a DNA binding factor using a scoring matrix and a threshold."""
43
44 - def __init__(self, name, dna, thresholdBase, thresholdIncrement) :
45 self.name = name
46 self.decode(dna, thresholdBase, thresholdIncrement)
47
48
def decode(self, dna, thresholdBase, thresholdIncrement):
    """Decode a string of dna bases to a matrix and a threshold.

    The sequence is read in words of length 5, for as long as at least
    5 unread characters remain. The first four characters of a word are
    mapped to integer values in {0, 1, 2, 3}. If all four are identical,
    decoding stops. Otherwise the four values are normalised and appended
    to the matrix as a further column, and the 5th character is mapped to
    an integer which is multiplied by thresholdIncrement and added to the
    threshold. The threshold starts at thresholdBase.

    Sets self.threshold, self.matrix and self.encoding_sequence. The
    latter is the prefix of dna that was consumed, including the four
    identical characters of a stop word if one was encountered.
    """
    self.threshold = thresholdBase
    self.matrix = []
    i = 0
    while len(dna) - i >= 5:
        # A real list is needed here because the entries are indexed below.
        c = [baseToInt(base) for base in dna[i:i + 4]]
        if c[0] == c[1] == c[2] == c[3]:
            # Stop word: only its four identical bases count as consumed.
            i = i + 4
            break
        self.matrix.append(normalisedColumn(c))
        self.threshold = self.threshold + baseToInt(dna[i + 4]) * thresholdIncrement
        i = i + 5
    self.encoding_sequence = dna[:i]
71
72
74
75 sa = 'a '
76 sc = 'c '
77 sg = 'g '
78 st = 't '
79 for c in self.matrix :
80 sa = sa + ('%5.2f' % c[0])
81 sc = sc + ('%5.2f' % c[1])
82 sg = sg + ('%5.2f' % c[2])
83 st = st + ('%5.2f' % c[3])
84 return 'DNABinder %s, threshold = %5.1f, max. score = %5.1f\n%s\n%s\n%s\n%s\nencoded by: %s' % (self.name, self.threshold, self.max_score(), sa, sc, sg, st, self.encoding_sequence)
85
86
def max_score(self):
    """Return the sum of the largest entry of each matrix column.

    The result is a float; it is 0.0 when the matrix has no columns.
    """
    return sum([max(column) for column in self.matrix], 0.0)
92
93
def bindingEnergy(self, seq):
    """Return the score of this binder's matrix against the start of seq.

    For each matrix column, the entry selected by the base at the same
    offset in seq is added up. Characters of seq beyond the length of the
    matrix are ignored. If seq is shorter than the matrix, 0.0 is
    returned without looking at any base.
    """
    if len(seq) < len(self.matrix):
        return 0.0
    e = 0.0
    # zip stops at the end of the matrix, which is the shorter of the two
    # thanks to the length check above.
    for column, base in zip(self.matrix, seq):
        e = e + column[baseToInt(base)]
    return e
101
102
def bindingSite(self, seq, position):
    """Return a BindingSite if this binder binds seq at position, else None.

    A binder with an empty matrix never binds. Otherwise the binding
    energy of seq[position:] is computed; if it is below the threshold,
    there is no binding. The strength of a site is the energy divided by
    the threshold, or 1.0 if the threshold is not positive.
    """
    if not self.matrix:
        return None
    energy = self.bindingEnergy(seq[position:])
    if energy < self.threshold:
        return None
    # NOTE(review): with a threshold <= 0.0, a remainder of seq that is
    # shorter than the matrix (energy 0.0) also passes the test above.
    strength = 1.0
    if self.threshold > 0.0:
        strength = energy / self.threshold
    return BindingSite(self, position, strength)
115
116
def find_binding_sites(proteome, seq):
    """Return all binding sites of the binders in proteome on seq.

    Every position from 0 up to, but not including, len(seq) - 1 is
    probed with each element of proteome by calling its
    bindingSite(seq, position) method; all results that are not None are
    collected, ordered by position first and by proteome order second.
    """
    # NOTE(review): the last position of seq is never probed. A binder
    # whose matrix has a single column could therefore be missed there --
    # confirm whether this bound is intended before changing it.
    probed = [p.bindingSite(seq, i) for i in range(len(seq) - 1) for p in proteome]
    return [bs for bs in probed if bs is not None]
125
126
128 """Represents a raw gene, i.e. the sequences of the gene's elements, which
129 are the gene start (promoter), the activator binding area, the repressor binding
130 area, the structural area, and the end (transcriptional terminator). The gene is
131 'raw' in the sense that this class holds just the sequences, not any data
132 structures derived from them, such as DNABinder etc."""
133 - def __init__(self, position, gene_name, product_name, geneStart, geneEnd, activatorArea, repressorArea, structuralArea) :
134 self.position = position
135 self.gene_name = gene_name
136 self.product_name = product_name
137 self.activatorArea = activatorArea
138 self.repressorArea = repressorArea
139 self.structuralArea = structuralArea
140 self.geneStart = geneStart
141 self.geneEnd = geneEnd
142
143
146
147
def rawSequence(self):
    """Return the gene's sequence as a single string.

    The parts are concatenated in the order gene start, promoter area
    (as returned by self.promoterArea()), structural area, gene end.
    """
    parts = [self.geneStart, self.promoterArea(), self.structuralArea, self.geneEnd]
    return ''.join(parts)
150
151
153 """Decodes genome sequences into transsys programs.
154 The overall gene structure is::
155
156 ...|-geneStart-|----activator area-----|----repressor area ----|--structural area--|-geneEnd...
157                |<-activatorAreaLength->|<-repressorAreaLength->|
158
159 The structural area length is variable. Gene start and end can also be variable,
160 as enabled by regular expressions.
161
162 This decoder uses the following constants:
163 - geneStartRE, geneEndRE: Regular expressions to determine start and end of
164 a gene
165 - repressorAreaLength: length of area where binding represses
166 - activatorAreaLength: length of area where binding activates
167 - decay, diffusibility: constants for factor construction
168 - a_spec, a_max: constants for promoter element construction
169 (used for both activate and repress elements)
170 - constitutive: constant for constitutive promoter element construction
171 Notice that strength of activation / repression does not depend on binding
172 strength in this decoder."""
173
# Expected content of the first line of a saved parameter file; the line
# read first when parsing parameters is compared against this value.
savefile_magic = 'TranssysDNADecoderParameters-1.3'
175
177 self.thresholdBase = None
178 self.trhesholdIncrement = None
179 self.factorNameTemplate = None
180 self.geneNameTemplate = None
181 self.geneStartRE = None
182 self.geneEndRE = None
183 self.repressorAreaLength = None
184 self.activatorAreaLength = None
185 self.decay = None
186 self.diffusibility = None
187 self.a_spec = None
188 self.a_max = None
189 self.constitutive = None
190
191
193 s = 'thresholdBase: %g\n' % self.thresholdBase
194 s = s + 'thresholdIncrement: %g\n' % self.thresholdIncrement
195 s = s + 'factorNameTemplate: %s\n' % self.factorNameTemplate
196 s = s + 'geneNameTemplate: %s\n' % self.geneNameTemplate
197 s = s + 'geneStartRE: %s\n' % self.geneStartRE.pattern
198 s = s + 'geneEndRE: %s\n' % self.geneEndRE.pattern
199 s = s + 'repressorAreaLength: %d\n' % self.repressorAreaLength
200 s = s + 'activatorAreaLength: %d\n' % self.activatorAreaLength
201 s = s + 'decay: %g\n' % self.decay
202 s = s + 'diffusibility: %g\n' % self.diffusibility
203 s = s + 'a_spec: %g\n' % self.a_spec
204 s = s + 'a_max: %g\n' % self.a_max
205 s = s + 'constitutive: %g' % self.constitutive
206 return s
207
208
213
214
def parse(self, f):
    """Read the decoder parameters from the open file f.

    The first line must be equal to self.savefile_magic (ignoring
    surrounding whitespace). The parameters are then read in a fixed
    order using the parse_float, parse_string and parse_int helpers of
    transsys.utils; the two regular expressions are installed through
    setGeneStartRE and setGeneEndRE.

    Raises ValueError if the first line is not the expected magic. (On
    Python 2, ValueError is a subclass of StandardError, so handlers
    written for StandardError still catch it.)
    """
    line = f.readline()
    if line.strip() != self.savefile_magic:
        raise ValueError('TranssysDNADecode::parse: bad magic "%s"' % line.strip())
    self.thresholdBase = transsys.utils.parse_float(f, 'thresholdBase')
    self.thresholdIncrement = transsys.utils.parse_float(f, 'thresholdIncrement')
    self.factorNameTemplate = transsys.utils.parse_string(f, 'factorNameTemplate').strip()
    self.geneNameTemplate = transsys.utils.parse_string(f, 'geneNameTemplate').strip()
    self.setGeneStartRE(transsys.utils.parse_string(f, 'geneStartRE').strip())
    self.setGeneEndRE(transsys.utils.parse_string(f, 'geneEndRE').strip())
    self.repressorAreaLength = transsys.utils.parse_int(f, 'repressorAreaLength')
    self.activatorAreaLength = transsys.utils.parse_int(f, 'activatorAreaLength')
    self.decay = transsys.utils.parse_float(f, 'decay')
    self.diffusibility = transsys.utils.parse_float(f, 'diffusibility')
    self.a_spec = transsys.utils.parse_float(f, 'a_spec')
    self.a_max = transsys.utils.parse_float(f, 'a_max')
    self.constitutive = transsys.utils.parse_float(f, 'constitutive')
232
233
def setGeneStartRE(self, r):
    """Compile the regular expression r and store it as self.geneStartRE."""
    compiled = re.compile(r)
    self.geneStartRE = compiled
236
237
def setGeneEndRE(self, r):
    """Compile the regular expression r and store it as self.geneEndRE."""
    compiled = re.compile(r)
    self.geneEndRE = compiled
240
241
def rawDNAGenes(self, genome):
    """Separate a genome into raw genes.

    Each match of self.geneStartRE in genome starts a gene. The activator
    area, the repressor area and the structural area follow the matched
    start directly, in that order. The structural area extends up to the
    first match of self.geneEndRE at or after its start. The search for
    the next gene start resumes one character after the current start,
    so genes may overlap.

    Returns the list of RawDNAGene objects in order of their position.
    """
    genes = []
    start_match = self.geneStartRE.search(genome)
    while start_match:
        position = start_match.start()
        geneStart = start_match.group()
        # The three areas are laid out back to back after the start match.
        a_start = position + len(geneStart)
        r_start = a_start + self.activatorAreaLength
        s_start = r_start + self.repressorAreaLength
        end_match = self.geneEndRE.search(genome, s_start)
        if end_match:
            s_end, geneEnd = end_match.start(), end_match.group()
        else:
            # No end found: the structural area runs to the end of the
            # genome and '*' stands in for the gene end.
            s_end, geneEnd = len(genome), '*'
        gene_name = self.geneNameTemplate % position
        product_name = self.factorNameTemplate % position
        genes.append(RawDNAGene(position, gene_name, product_name, geneStart, geneEnd,
                                genome[a_start:r_start], genome[r_start:s_start], genome[s_start:s_end]))
        start_match = self.geneStartRE.search(genome, position + 1)
    return genes
271
272
274 """construct a transsys program by decoding a genome sequence. The
275 process is: (1) split the genome into sequence portions that represent
276 genes, based on the gene start and end regular expressions. (2) For each
277 Gene, translate the structural parts of a gene into a DNABinder; the set
278 of all DNABinders constitutes the proteome. (3) For each gene, determine
279 the multiset of DNABinders that bind in the activating and the repressing
280 regions, respectively. (4) Construct a transsys program of the genes and
281 the proteome found, with promoters constructed based on (3)."""
282 raw_gene_list = self.rawDNAGenes(genome)
283 proteome = []
284 factor_list = []
285 for rg in raw_gene_list :
286 p = DNABinder(rg.product_name, rg.structuralArea, self.thresholdBase, self.thresholdIncrement)
287 proteome.append(p)
288 decay_expr = transsys.ExpressionNodeValue(self.decay)
289 diffusibility_expr = transsys.ExpressionNodeValue(self.diffusibility)
290 factor = transsys.Factor(rg.product_name, decay_expr, diffusibility_expr)
291 factor.comments.append('decoded to DNABinder:')
292 for l in str(p).split('\n') :
293 factor.comments.append(l)
294 factor_list.append(factor)
295
296
297 gene_list = []
298 for rg in raw_gene_list :
299 activators = find_binding_sites(proteome, rg.activatorArea)
300 repressors = find_binding_sites(proteome, rg.repressorArea)
301 promoter = [transsys.PromoterElementConstitutive(transsys.ExpressionNodeValue(self.constitutive))]
302 for a in activators :
303 a_spec = transsys.ExpressionNodeValue(self.a_spec)
304 a_max = transsys.ExpressionNodeValue(self.a_max)
305 promoter.append(transsys.PromoterElementActivate(a_spec, a_max, [a.protein.name]))
306 for r in repressors :
307 r_spec = transsys.ExpressionNodeValue(self.a_spec)
308 r_max = transsys.ExpressionNodeValue(self.a_max)
309 promoter.append(transsys.PromoterElementRepress(r_spec, r_max, [r.protein.name]))
310 gene = transsys.Gene(rg.gene_name, rg.product_name, promoter)
311 gene.comments.append('gene: %s' % rg.rawSequence())
312 gene.comments.append('promoter area: %s' % (' ' * len(rg.geneStart)) + rg.promoterArea())
313 gene.comments.append('structural area: %s' % (' ' * (len(rg.geneStart) + len(rg.promoterArea())) + rg.structuralArea))
314
315 gene.comments.append('activator area:')
316 gene.comments.append(rg.activatorArea)
317 for a in activators :
318 gene.comments.append(a.arrowLine())
319 gene.comments.append('repressor area:')
320 gene.comments.append(rg.repressorArea)
321 for r in repressors :
322 gene.comments.append(r.arrowLine())
323 gene_list.append(gene)
324 return transsys.TranssysProgram(transsys_name, factor_list, gene_list, resolve = True)
325