Package omero :: Package util :: Module populate_metadata
[hide private]
[frames | no frames]

Source Code for Module omero.util.populate_metadata

  1  #!/usr/bin/env python 
  2  # encoding: utf-8 
  3  """ 
  4  Populate bulk metadata tables from delimited text files. 
  5  """ 
  6   
  7  # 
  8  #  Copyright (C) 2011 University of Dundee. All rights reserved. 
  9  # 
 10  # 
 11  #  This program is free software; you can redistribute it and/or modify 
 12  #  it under the terms of the GNU General Public License as published by 
 13  #  the Free Software Foundation; either version 2 of the License, or 
 14  #  (at your option) any later version. 
 15  #  This program is distributed in the hope that it will be useful, 
 16  #  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  #  GNU General Public License for more details. 
 19  # 
 20  #  You should have received a copy of the GNU General Public License along 
 21  #  with this program; if not, write to the Free Software Foundation, Inc., 
 22  #  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
 23  # 
 24   
 25   
 26  import exceptions 
 27  import tempfile 
 28  import logging 
 29  import time 
 30  import sys 
 31  import csv 
 32  import re 
 33  from threading import Thread 
 34  from StringIO import StringIO 
 35  from getpass import getpass 
 36  from getopt import getopt, GetoptError 
 37  from Queue import Queue 
 38   
 39  import omero.clients 
 40  from omero.rtypes import rdouble, rstring, rint 
 41  from omero.model import DatasetAnnotationLink, DatasetI, FileAnnotationI, \ 
 42                          OriginalFileI, PlateI, PlateAnnotationLinkI, ScreenI, \ 
 43                          ScreenAnnotationLinkI 
 44  from omero.grid import ImageColumn, LongColumn, PlateColumn, StringColumn, \ 
 45                         WellColumn 
 46  from omero.util.temp_files import create_path, remove_path 
 47  from omero import client 
 48   
 49  from populate_roi import ThreadPool 
 50   
 51  # Handle Python 2.5 built-in ElementTree 
 52  try: 
 53          from xml.etree.cElementTree import XML, Element, SubElement, ElementTree, dump, iterparse 
 54  except ImportError: 
 55          from cElementTree import XML, Element, SubElement, ElementTree, dump, iterparse 
 56   
 57  log = logging.getLogger("omero.util.populate_metadata") 
 58   
def usage(error):
    """
    Print `error` followed by the command line help text, then terminate
    the process with exit status 2.
    """
    program = sys.argv[0]
    help_text = """%s
Usage: %s [options] <target_object> <file>
Runs metadata population code for a given object.

Options:
  -s      OMERO hostname to use [defaults to "localhost"]
  -p      OMERO port to use [defaults to 4064]
  -u      OMERO username to use
  -w      OMERO password
  -k      OMERO session key to use
  -i      Dump measurement information and exit (no population)
  -d      Print debug statements

Examples:
  %s -s localhost -p 14064 -u bob Plate:6 metadata.csv

Report bugs to ome-devel@lists.openmicroscopy.org.uk"""
    # The first placeholder is the error, the other two the program name
    print(help_text % (error, program, program))
    sys.exit(2)

# Thread pool shared by workers; stays None until the script entry point
# assigns one
thread_pool = None

# Names of the extra columns that may be added depending on the data type
PLATE_NAME_COLUMN = 'Plate Name'
WELL_NAME_COLUMN = 'Well Name'

class Skip(object):
    """Marker type: an instance denotes a request to skip the current row."""
91
class MetadataError(Exception):
    """
    Exception raised by the metadata parsing context when it reaches an
    error condition.
    """
98
class HeaderResolver(object):
    """
    Resolver for known header names.  It is responsible for creating the
    set of columns for the OMERO.tables instance from the header row of the
    input file.
    """

    # Size every string column starts out with
    DEFAULT_COLUMN_SIZE = 1

    # Lower-cased header name -> column class used for Plate targets
    plate_keys = {
        'well': WellColumn,
        'field': ImageColumn,
        'row': LongColumn,
        'column': LongColumn,
        'wellsample': ImageColumn
    }

    # Screen targets use everything in plate_keys plus 'plate'
    screen_keys = dict({
        'plate': PlateColumn,
    }, **plate_keys)

    def __init__(self, target_object, headers):
        """
        Store the target and the header names.  Any '/' in a header is
        replaced by a backslash, and a lower-cased copy of the headers is
        kept for lookups.
        """
        self.target_object = target_object
        self.headers = [header.replace('/', '\\') for header in headers]
        self.headers_as_lower = [header.lower() for header in self.headers]

    def create_columns(self):
        """
        Build the column list appropriate for the class of the target
        object.  Raises MetadataError for any class other than ScreenI,
        PlateI or DatasetI.
        """
        kind = self.target_object.__class__
        object_id = self.target_object.id.val
        if kind is ScreenI:
            log.debug('Creating columns for Screen:%d' % object_id)
            builder = self.create_columns_screen
        elif kind is PlateI:
            log.debug('Creating columns for Plate:%d' % object_id)
            builder = self.create_columns_plate
        elif kind is DatasetI:
            log.debug('Creating columns for Dataset:%d' % object_id)
            builder = self.create_columns_dataset
        else:
            raise MetadataError('Unsupported target object class: %s' \
                    % kind)
        return builder()

    def _columns_from_headers(self, known_keys):
        """
        Create one column per header: the class registered in `known_keys`
        for the lower-cased header, or a StringColumn when there is none.
        """
        columns = list()
        for name, lowered in zip(self.headers, self.headers_as_lower):
            try:
                column_class = known_keys[lowered]
            except KeyError:
                columns.append(StringColumn(
                    name, '', self.DEFAULT_COLUMN_SIZE, list()))
            else:
                columns.append(column_class(name, '', list()))
        return columns

    def create_columns_screen(self):
        """
        Columns for a Screen target.  After the per-header columns, one
        "Plate Name" string column is added for each PlateColumn and one
        "Well Name" string column for each WellColumn, in header order.
        """
        columns = self._columns_from_headers(self.screen_keys)
        name_columns = list()
        for column in columns:
            if column.__class__ is PlateColumn:
                name_columns.append(StringColumn(
                    PLATE_NAME_COLUMN, '', self.DEFAULT_COLUMN_SIZE, list()))
            if column.__class__ is WellColumn:
                name_columns.append(StringColumn(
                    WELL_NAME_COLUMN, '', self.DEFAULT_COLUMN_SIZE, list()))
        columns.extend(name_columns)
        return columns

    def create_columns_plate(self):
        """Columns for a Plate target: one column per header."""
        return self._columns_from_headers(self.plate_keys)

    def create_columns_dataset(self):
        """Dataset targets are not supported yet; always raises."""
        raise Exception('To be implemented!')
172
class ValueResolver(object):
    """
    Value resolver for column types which is responsible for filling up
    non-metadata columns with their OMERO data model identifiers.

    The constructor loads the target (Plate or Screen) through the query
    service and indexes its wells by plate name, row letter and 1-based
    column number so that resolve() can translate file values.
    """

    AS_ALPHA = [chr(v) for v in range(97, 122 + 1)]  # a-z
    WELL_REGEX = re.compile(r'^([a-zA-Z]+)(\d+)$')

    def __init__(self, client, target_object):
        """
        client        -- object whose getSession() gives access to the
                         query service.
        target_object -- PlateI, ScreenI or DatasetI instance to load.

        Raises MetadataError for any other target class.
        """
        self.client = client
        self.target_object = target_object
        self.target_class = self.target_object.__class__
        # Defined up front so every instance has the same attributes
        # regardless of which loader runs
        self.wells_by_location = dict()
        self.plates_by_name = dict()
        self.plates_by_id = dict()
        if PlateI is self.target_class:
            self.load_plate()
        elif DatasetI is self.target_class:
            self.load_dataset()
        elif ScreenI is self.target_class:
            self.load_screen()
        else:
            # Was "% target_class": an undefined name, i.e. a NameError
            raise MetadataError('Unsupported target object class: %s' \
                    % self.target_class)

    def _find_plate(self, query_service, plate_id):
        """Fetch a plate together with its wells and well samples."""
        parameters = omero.sys.ParametersI()
        parameters.addId(plate_id)
        return query_service.findByQuery(
                'select p from Plate as p '
                'join fetch p.wells as w '
                'join fetch w.wellSamples as ws '
                'where p.id = :id', parameters)

    def load_screen(self):
        """
        Load the target Screen with its plates and index every plate by
        name and id, and its wells by location.

        Raises MetadataError when the screen cannot be found.
        """
        query_service = self.client.getSession().getQueryService()
        parameters = omero.sys.ParametersI()
        parameters.addId(self.target_object.id.val)
        log.debug('Loading Screen:%d' % self.target_object.id.val)
        self.target_object = query_service.findByQuery(
                'select s from Screen as s '
                'join fetch s.plateLinks as p_link '
                'join fetch p_link.child as p '
                'where s.id = :id', parameters)
        if self.target_object is None:
            # Was MetadataException, which is not defined anywhere
            raise MetadataError('Could not find target object!')
        self.wells_by_location = dict()
        self.plates_by_name = dict()
        self.plates_by_id = dict()
        for link in self.target_object.copyPlateLinks():
            plate_id = link.child.id.val
            plate = self._find_plate(query_service, plate_id)
            if plate is None:
                # NOTE(review): presumably a plate without wells or well
                # samples, which the fetch joins would exclude -- confirm
                log.warn('Could not load Plate:%d, ignoring.' % plate_id)
                continue
            self.plates_by_name[plate.name.val] = plate
            self.plates_by_id[plate.id.val] = plate
            wells_by_location = dict()
            self.wells_by_location[plate.name.val] = wells_by_location
            self.parse_plate(plate, wells_by_location)

    def load_plate(self):
        """
        Load the target Plate with its wells and index the wells by
        location.

        Raises MetadataError when the plate cannot be found.
        """
        query_service = self.client.getSession().getQueryService()
        log.debug('Loading Plate:%d' % self.target_object.id.val)
        self.target_object = self._find_plate(
                query_service, self.target_object.id.val)
        if self.target_object is None:
            # Was MetadataException, which is not defined anywhere
            raise MetadataError('Could not find target object!')
        self.wells_by_location = dict()
        wells_by_location = dict()
        self.wells_by_location[self.target_object.name.val] = wells_by_location
        self.parse_plate(self.target_object, wells_by_location)

    def parse_plate(self, plate, wells_by_location):
        """
        Fill `wells_by_location` so that
        wells_by_location[row letter][column number as string] is the well.
        """
        # TODO: This should use the PlateNamingConvention. We're assuming rows
        # as alpha and columns as numeric.
        for well in plate.copyWells():
            row_letter = self.AS_ALPHA[well.row.val]
            # 0 offsetted is not what people use in reality
            column = str(well.column.val + 1)
            wells_by_location.setdefault(row_letter, dict())[column] = well
        log.debug('Completed parsing plate: %s' % plate.name.val)
        for row in wells_by_location:
            log.debug('%s: %r' % (row, wells_by_location[row].keys()))

    def load_dataset(self):
        """Dataset targets are not supported yet; always raises."""
        raise Exception('To be implemented!')

    def _resolve_well(self, value, row):
        """
        Translate a well identifier such as "A10" into the id of the
        matching well, -1 when the plate has no such well, or a Skip
        instance when the row names a plate that was not loaded.
        """
        m = self.WELL_REGEX.match(value)
        if m is None or len(m.groups()) != 2:
            raise MetadataError(
                    'Cannot parse well identifier "%s" from row: %r' % \
                    (value, [o[1] for o in row]))
        plate_row = m.group(1).lower()
        plate_column = m.group(2)
        if len(self.wells_by_location) == 1:
            wells_by_location = list(self.wells_by_location.values())[0]
            log.debug('Parsed "%s" row: %s column: %s' % \
                    (value, plate_row, plate_column))
        else:
            # Several plates: the plate value of the same row tells us
            # which plate the well belongs to
            wells_by_location = None
            for row_column, plate in row:
                if row_column.__class__ is not PlateColumn:
                    continue
                try:
                    wells_by_location = self.wells_by_location[plate]
                except KeyError:
                    # Same outcome as resolving the plate column itself
                    log.warn('Screen is missing plate: %s' % plate)
                    return Skip()
                log.debug('Parsed "%s" row: %s column: %s plate: %s' % \
                        (value, plate_row, plate_column, plate))
                break
            if wells_by_location is None:
                # Previously an UnboundLocalError further down
                raise MetadataError(
                        'Cannot resolve well "%s" without a plate column '
                        'in row: %r' % (value, [o[1] for o in row]))
        try:
            return wells_by_location[plate_row][plate_column].id.val
        except KeyError:
            log.debug('Row: %s Column: %s not found!' % \
                    (plate_row, plate_column))
            return long(-1)

    def resolve(self, column, value, row):
        """
        Convert one raw file value into what should be stored in `column`.

        column -- the column instance the value belongs to.
        value  -- the raw string taken from the file.
        row    -- the complete row as a list of (column, raw value) pairs.

        Returns the resolved value, or a Skip instance when the whole row
        should be dropped.  Raises MetadataError for an unparseable well
        identifier or an unsupported column class.
        """
        column_class = column.__class__
        column_as_lower = column.name.lower()
        if WellColumn is column_class:
            return self._resolve_well(value, row)
        if PlateColumn is column_class:
            try:
                return self.plates_by_name[value].id.val
            except KeyError:
                log.warn('Screen is missing plate: %s' % value)
                return Skip()
        if column_as_lower in ('row', 'column') \
           and column_class is LongColumn:
            try:
                # The value is not 0 offsetted
                return long(value) - 1
            except ValueError:
                # Not a number, treat it as a row letter
                return long(self.AS_ALPHA.index(value.lower()))
        if StringColumn is column_class:
            return value
        raise MetadataError('Unsupported column class: %s' % column_class)
303
class ParsingContext(object):
    """
    Generic parsing context for CSV files.

    Typical use: construct, call parse() to read the file into columns,
    then write_to_omero() to store them as a table linked to the target.
    """

    def __init__(self, client, target_object, file):
        """
        client        -- client used for every server call.
        target_object -- object the resulting table gets linked to.
        file          -- path of the comma separated file to read.
        """
        self.client = client
        self.target_object = target_object
        self.file = file
        self.value_resolver = ValueResolver(self.client, self.target_object)

    # NOTE(review): the listing this class was taken from jumps from source
    # line 312 to 323 at this point.  write_to_omero() calls
    # self.create_annotation_link(), which is not defined in the visible
    # code and presumably belongs here -- restore it from the upstream file.

    def get_column_widths(self):
        """Return each column's `size`, or None for columns without one."""
        return [getattr(column, 'size', None) for column in self.columns]

    def parse(self):
        """
        Read the file, create the columns from its first row and fill them
        from the remaining rows.

        Raises MetadataError when the file contains no rows at all.
        """
        data = open(self.file, 'U')
        try:
            rows = list(csv.reader(data, delimiter=','))
        finally:
            data.close()
        if not rows:
            # Previously surfaced as a bare IndexError on rows[0]
            raise MetadataError('No header row found in: %s' % self.file)
        log.debug('Header: %r' % rows[0])
        header_resolver = HeaderResolver(self.target_object, rows[0])
        self.columns = header_resolver.create_columns()
        log.debug('Columns: %r' % self.columns)
        self.populate(rows[1:])
        self.post_process()
        log.debug('Column widths: %r' % self.get_column_widths())
        log.debug('Columns: %r' % \
                [(o.name, len(o.values)) for o in self.columns])
        # Paranoid debugging
        #for i in range(len(self.columns[0].values)):
        #    values = list()
        #    for column in self.columns:
        #        values.append(column.values[i])
        #    log.debug('Row: %r' % values)

    def populate(self, rows):
        """
        Resolve every value of every row and append it to its column.

        Blank rows are ignored.  A row is dropped entirely when the value
        resolver answers with a Skip instance for any of its values.
        Raises IndexError when a row has fewer values than there are
        columns to fill.
        """
        for row in rows:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            row = [(self.columns[i], value) for i, value in enumerate(row)]
            values = list()
            skip_row = False
            for column, original_value in row:
                value = self.value_resolver.resolve(
                        column, original_value, row)
                if value.__class__ is Skip:
                    skip_row = True
                    break
                values.append(value)
                try:
                    # Widen the column to fit anything that is not a long
                    if value.__class__ is not long:
                        column.size = max(column.size, len(value))
                except TypeError:
                    log.error('Original value "%s" now "%s" of bad type!' % \
                            (original_value, value))
                    raise
            if skip_row:
                continue
            values.reverse()
            for column in self.columns:
                # The name columns are filled in by post_process()
                if column.name in (PLATE_NAME_COLUMN, WELL_NAME_COLUMN):
                    continue
                try:
                    column.values.append(values.pop())
                except IndexError:
                    log.error('Column %s has no values to pop.' % \
                            column.name)
                    raise

    def post_process(self):
        """
        Fill the "Well Name" and "Plate Name" columns, when present, from
        the already populated row, column and plate columns.

        Raises KeyError when a column needed to build a name is missing.
        """
        # Headers are matched case-insensitively when the columns are
        # created, so they are looked up the same way here
        columns_by_name = dict()
        well_name_column = None
        plate_name_column = None
        for column in self.columns:
            columns_by_name[column.name.lower()] = column
            if column.name == WELL_NAME_COLUMN:
                well_name_column = column
            elif column.name == PLATE_NAME_COLUMN:
                plate_name_column = column
        if well_name_column is None and plate_name_column is None:
            log.info('Nothing to do during post processing.')
            return
        # Reported once rather than once per row
        if well_name_column is None:
            log.info('Missing well name column, skipping.')
        if plate_name_column is None:
            log.info('Missing plate name column, skipping.')
        for i in range(0, len(self.columns[0].values)):
            if well_name_column is not None:
                try:
                    row = columns_by_name['row'].values[i]
                    col = columns_by_name['column'].values[i]
                except KeyError:
                    log.error('Missing row or column for well name population!')
                    raise
                row = self.value_resolver.AS_ALPHA[row]
                # Stored columns are 0 based, names are 1 based
                v = '%s%d' % (row, col + 1)
                well_name_column.size = max(well_name_column.size, len(v))
                well_name_column.values.append(v)
            if plate_name_column is not None:
                plate = columns_by_name['plate'].values[i]
                plate = self.value_resolver.plates_by_id[plate]
                v = plate.name.val
                plate_name_column.size = max(plate_name_column.size, len(v))
                plate_name_column.values.append(v)

    def write_to_omero(self):
        """
        Create a new table, store the columns in it and link it to the
        target object through a file annotation.

        Raises MetadataError when the table cannot be created.
        """
        sf = self.client.getSession()
        sr = sf.sharedResources()
        update_service = sf.getUpdateService()
        name = 'bulk_annotations'
        table = sr.newTable(1, name)
        # Checked before first use; it used to follow getOriginalFile()
        if table is None:
            raise MetadataError(
                    "Unable to create table: %s" % name)
        try:
            original_file = table.getOriginalFile()
            log.info('Created new table OriginalFile:%d' % original_file.id.val)
            table.initialize(self.columns)
            log.info('Table initialized with %d columns.' % (len(self.columns)))
            table.addData(self.columns)
            log.info('Added data column data.')
        finally:
            # Release the table handle even when writing fails
            table.close()
        file_annotation = FileAnnotationI()
        file_annotation.ns = \
                rstring('openmicroscopy.org/omero/bulk_annotations')
        file_annotation.description = rstring(name)
        file_annotation.file = OriginalFileI(original_file.id.val, False)
        link = self.create_annotation_link()
        link.parent = self.target_object
        link.child = file_annotation
        update_service.saveObject(link)
449
def parse_target_object(target_object):
    """
    Turn a "<Type>:<id>" string into an unloaded model object.

    Supported types are Dataset, Plate and Screen.  Raises ValueError when
    the string does not split into exactly two parts on ':', when the type
    is not supported, or when the id is not a valid integer.
    """
    kind, object_id = target_object.split(':')
    if kind == 'Dataset':
        factory = DatasetI
    elif kind == 'Plate':
        factory = PlateI
    elif kind == 'Screen':
        factory = ScreenI
    else:
        raise ValueError('Unsupported target object: %s' % target_object)
    return factory(long(object_id), False)

if __name__ == "__main__":
    # Command line entry point: parse the options, connect to the server,
    # parse the file and, unless -i was given, write the table.
    try:
        # "t:" was missing here, which made the -t branch below unreachable
        options, args = getopt(sys.argv[1:], "s:p:u:w:k:t:id")
    except GetoptError:
        # sys.exc_info() avoids the version specific "except X, e" and
        # "except X as e" spellings
        usage(sys.exc_info()[1].msg)

    try:
        target_object, file_name = args
        target_object = parse_target_object(target_object)
    except ValueError:
        usage('Target object and file must be specified!')

    username = None
    password = None
    hostname = 'localhost'
    port = 4064  # SSL
    info = False
    session_key = None
    logging_level = logging.INFO
    thread_count = 1
    for option, argument in options:
        if option == "-u":
            username = argument
        elif option == "-w":
            password = argument
        elif option == "-s":
            hostname = argument
        elif option == "-p":
            port = int(argument)
        elif option == "-i":
            info = True
        elif option == "-k":
            session_key = argument
        elif option == "-d":
            logging_level = logging.DEBUG
        elif option == "-t":
            thread_count = int(argument)
    # A session key replaces the username and password
    if session_key is None and username is None:
        usage("Username must be specified!")
    if session_key is None and hostname is None:
        usage("Host name must be specified!")
    if session_key is None and password is None:
        password = getpass()

    logging.basicConfig(level = logging_level)
    # Own name so the imported "client" class is not overwritten
    omero_client = client(hostname, port)
    omero_client.setAgent("OMERO.populate_metadata")
    omero_client.enableKeepAlive(60)
    try:
        if session_key is not None:
            omero_client.joinSession(session_key)
        else:
            omero_client.createSession(username, password)

        log.debug('Creating pool of %d threads' % thread_count)
        thread_pool = ThreadPool(thread_count)
        ctx = ParsingContext(omero_client, target_object, file_name)
        ctx.parse()
        if not info:
            ctx.write_to_omero()
    finally:
        omero_client.closeSession()