Module arrtool
[hide private]
[frames | no frames]

Source Code for Module arrtool

  1  #!/usr/bin/env python 
  2   
  3  # $Id: arrtool.py 2 2005-03-08 17:12:01Z jtk $ 
  4   
  5  # $Log$ 
  6  # Revision 1.1  2005/03/08 17:12:02  jtk 
  7  # Initial revision 
  8  # 
  9  # Revision 1.3  2003/02/14 16:31:27  kim 
 10  # regstruct project is nearing completion. regstruct_transsys for 
 11  #     generation of transsys programs and regstruct_transarray for 
 12  #     array simulation are (more or less) properly implemented. 
 13  #     Analysis with R has been assembled in microarr.r 
 14  # 
 15  # Revision 1.2  2003/02/04 23:42:17  kim 
 16  # made experiment_plots function from transarray a method of MicroarrayData 
 17  # 
 18  # Revision 1.1  2003/01/28 21:12:05  kim 
 19  # initial toolbox assembly 
 20  # 
 21   
 22   
 23  import sys 
 24  import string 
 25  import types 
 26  import re 
 27  import copy 
 28   
 29   
def cmp_by_score(x1, x2):
    """Three-way comparison of two sequences by their element at index 1.

    Returns -1 if x1[1] is smaller than x2[1], 1 if it is larger, and 0
    otherwise, i.e. the contract of a classic ``cmp`` function.
    """
    score1 = x1[1]
    score2 = x2[1]
    if score1 < score2:
        return -1
    if score1 > score2:
        return 1
    return 0
36 37
class ExperimentSpec:
    """Selection of array columns that together form one time series.

    Attributes:
        column_index: column positions used to index a data row.
        time_list: time value paired with each entry of column_index.
        title: label of the experiment (defaults to '(untitled)').
    """

    def __init__(self, col_index, time_list, title='(untitled)'):
        """Store private copies of the column indices and time values.

        Raises:
            ValueError: if col_index and time_list differ in length.
                (ValueError is a StandardError subclass under Python 2, so
                callers catching StandardError keep working.)
        """
        if len(col_index) != len(time_list):
            raise ValueError('ExperimentSpec::__init__: lengths of column_index (%d) and time_list (%d) differ' % (len(col_index), len(time_list)))
        # Deep copies so later changes to the caller's lists do not leak in.
        self.column_index = copy.deepcopy(col_index)
        self.time_list = copy.deepcopy(time_list)
        self.title = title

    def num_columns(self):
        """Return the number of time points in this experiment."""
        return len(self.time_list)

    def gene_plot_data(self, arrdata, row_index):
        """Return [time, value] pairs for one row of arrdata.

        Only cells holding a float contribute a pair; cells with any other
        content, and columns that lie outside a short row, are skipped.
        """
        row = arrdata.data[row_index]
        points = []
        for ci, t in zip(self.column_index, self.time_list):
            # Guard against ragged rows instead of failing with IndexError.
            if -len(row) <= ci < len(row) and isinstance(row[ci], float):
                points.append([t, row[ci]])
        return points

    def write_gnuplot_file(self, f, arrdata):
        """Write the time series of every row of arrdata to file object f.

        Each row produces a '# <first cell>' comment line.  If the row has
        at least one plottable point, the 'time value' lines follow,
        terminated by one blank line.
        """
        for row_index in range(arrdata.num_rows()):
            points = self.gene_plot_data(arrdata, row_index)
            f.write('# %s\n' % arrdata.data[row_index][0])
            if points:
                for t, v in points:
                    f.write('%f %f\n' % (t, v))
                f.write('\n')
70 71
class MicroarrayData:
    """Table of microarray values read from a tab-separated text file.

    Attributes:
        column_label: list of header strings, one per column.
        data: list of rows; each row is a list whose cells are floats
            (for numeric text) or the raw strings.  The first cell of a
            row is used as the gene ID.
        gene_dict: maps each gene ID to the index of its row in data.
    """

    # A complete decimal float literal: optional sign, digits and/or a
    # fraction, optional exponent.  Trailing whitespace is tolerated because
    # float() ignores it.  Words such as 'NA', 'NaN' or 'inf' and IDs such as
    # '1234_at' deliberately do not match, so they stay strings.
    _float_re = re.compile(r'[+-]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[Ee][+-]?[0-9]+)?\s*$')

    def __init__(self):
        """Create an empty table."""
        self.column_label = []
        self.data = []
        self.gene_dict = {}

    def setup_gene_dict(self):
        """Rebuild gene_dict from the first cell of every row in data.

        Raises:
            ValueError: if a gene ID is not a string, is empty, or occurs
                more than once.  (ValueError is a StandardError subclass
                under Python 2.)
        """
        self.gene_dict = {}
        for i, row in enumerate(self.data):
            gene_id = row[0]
            if not isinstance(gene_id, str):
                raise ValueError('MicroarrayData::setup_gene_dict: non-string gene ID')
            if gene_id == '':
                raise ValueError('MicroarrayData::setup_gene_dict: empty gene ID')
            # Direct dict membership: constant time instead of scanning keys().
            if gene_id in self.gene_dict:
                raise ValueError('MicroarrayData::setup_gene_dict: duplicate gene ID %s' % gene_id)
            self.gene_dict[gene_id] = i

    def num_rows(self):
        """Return the number of data rows."""
        return len(self.data)

    def num_columns(self):
        """Return the number of columns, as given by the header labels."""
        return len(self.column_label)

    def read_file(self, f):
        """Read a tab-separated table from the open text file f.

        Lines whose first non-blank character is '#' and blank lines are
        ignored.  The first remaining line supplies the column labels; each
        further line becomes one row that is appended to data, with cells
        that are complete float literals converted to float.  Finally
        gene_dict is rebuilt.

        Raises:
            ValueError: if no header line is found, or if setup_gene_dict
                rejects a gene ID.
        """

        def next_content_line(f):
            # Return the next line that is neither blank nor a comment,
            # or '' at end of file.
            line = f.readline()
            while line != '':
                stripped = line.strip()
                if stripped != '' and stripped[0] != '#':
                    break
                line = f.readline()
            return line

        header = next_content_line(f)
        if header == '':
            raise ValueError('MicroarrayData::read_file: no header line')
        self.column_label = header.rstrip('\r\n').split('\t')
        line = next_content_line(f)
        while line != '':
            row = []
            for cell in line.rstrip('\r\n').split('\t'):
                if self._float_re.match(cell):
                    row.append(float(cell))
                else:
                    row.append(cell)
            self.data.append(row)
            line = next_content_line(f)
        self.setup_gene_dict()

    def column_type(self, col):
        """Return float if every row holds a float in column col, else str.

        Raises:
            ValueError: if some row has no cell at index col.
        """
        for row in self.data:
            # '<=' because valid indices end at len(row) - 1.
            if len(row) <= col:
                raise ValueError('MicroarrayData::column_type: column index %d out of range' % col)
            if not isinstance(row[col], float):
                return str
        return float

    def write_gnuplot_file(self, f):
        """Write all purely numeric columns to file object f.

        A '# <index>: <label>' comment line is written for each float
        column, followed by one line per data row holding that row's values
        for those columns.
        """
        float_columns = []
        for i in range(len(self.column_label)):
            if self.column_type(i) is float:
                f.write('# %d: %s\n' % (i, self.column_label[i]))
                float_columns.append(i)
        for row in self.data:
            for i in float_columns:
                f.write('%1.12g ' % row[i])
            f.write('\n')

    def regulation_index(self, row_index):
        """Return the mean absolute value of the float cells of a row.

        Returns None if the row contains no float cell.
        """
        magnitudes = [abs(x) for x in self.data[row_index] if isinstance(x, float)]
        if not magnitudes:
            return None
        return sum(magnitudes) / len(magnitudes)

    def regulation_index_list(self):
        """Return (gene ID, regulation index) tuples sorted by ascending index.

        Rows for which regulation_index() yields None are left out.
        """
        l = []
        for row_index in range(self.num_rows()):
            r = self.regulation_index(row_index)
            if r is not None:
                l.append((self.data[row_index][0], r))
        # Stable sort on the score only, i.e. the ordering a cmp-style
        # comparison of element 1 would give.
        l.sort(key=lambda entry: entry[1])
        return l

    def experiment_plots(self, gpcfile, espec_list, histo_max, histo_nbins, basename, cmp_basename=None):
        """write a gnuplot file containing the expression profiles of all genes, and
        also a histogram showing a profile of regulatory strengths

        For each experiment spec e in espec_list the data file
        '<basename>_<e.title>.plt' is written via e.write_gnuplot_file().
        The regulation index values go to '<basename>_strength.plt' and
        their histogram (histo_nbins bins over the range 0 to histo_max) to
        '<basename>_shist.plt'.  The matching gnuplot 'plot' and 'pause'
        commands are written to the file object gpcfile.  If cmp_basename
        is given, each experiment plot additionally references
        '<cmp_basename>_<e.title>.plt'; that file is not written here.
        Values falling outside the histogram range are reported on stderr.
        """
        for e in espec_list:
            fname = '%s_%s.plt' % (basename, e.title)
            f = open(fname, 'w')
            try:
                e.write_gnuplot_file(f, self)
            finally:
                f.close()
            gpcfile.write('plot \'%s\' with linespoints' % fname)
            if cmp_basename:
                cmp_fname = '%s_%s.plt' % (cmp_basename, e.title)
                gpcfile.write(', \'%s\' with linespoints\n' % cmp_fname)
            gpcfile.write('\n')
            gpcfile.write('pause -1 \'Hit return\'\n')

        fname = '%s_strength.plt' % basename
        histogram = histo_nbins * [0]
        f = open(fname, 'w')
        try:
            f.write('# regulation index values\n')
            for gene_id, strength in self.regulation_index_list():
                hi = int((strength * histo_nbins) / histo_max)
                if hi < histo_nbins:
                    histogram[hi] = histogram[hi] + 1
                else:
                    sys.stderr.write('arraydata_plots: regulation strength %f out of histogram range %f: %d >= %d\n' % (strength, histo_max, hi, histo_nbins))
                f.write('# %s\n' % gene_id)
                f.write('%f\n' % strength)
        finally:
            f.close()
        gpcfile.write('plot \'%s\' with boxes' % fname)
        gpcfile.write('\n')
        gpcfile.write('pause -1 \'Hit return\'\n')

        fname = '%s_shist.plt' % basename
        f = open(fname, 'w')
        try:
            f.write('# regulation strength histogram\n')
            for i in range(histo_nbins):
                # float() keeps the bin position exact when histo_max is an
                # int (Python 2 would otherwise use integer division).
                f.write('%d %f %d\n' % (i, (i * float(histo_max)) / histo_nbins, histogram[i]))
        finally:
            f.close()
        gpcfile.write('plot \'%s\' using 2:3 with boxes\n' % fname)
        gpcfile.write('pause -1 \'Hit return\'\n')
221