
Source Code for Module omero.util.populate_metadata

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  """ 
  4  Populate bulk metadata tables from delimited text files. 
  5  """ 
  6   
  7  # 
  8  #  Copyright (C) 2011 University of Dundee. All rights reserved. 
  9  # 
 10  # 
 11  #  This program is free software; you can redistribute it and/or modify 
 12  #  it under the terms of the GNU General Public License as published by 
 13  #  the Free Software Foundation; either version 2 of the License, or 
 14  #  (at your option) any later version. 
 15  #  This program is distributed in the hope that it will be useful, 
 16  #  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  #  GNU General Public License for more details. 
 19  # 
 20  #  You should have received a copy of the GNU General Public License along 
 21  #  with this program; if not, write to the Free Software Foundation, Inc., 
 22  #  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
 23  # 
 24   
 25   
 26  import tempfile 
 27  import logging 
 28  import time 
 29  import sys 
 30  import csv 
 31  import re 
 32  from threading import Thread 
 33  from StringIO import StringIO 
 34  from getpass import getpass 
 35  from getopt import getopt, GetoptError 
 36  from Queue import Queue 
 37   
 38  import omero.clients 
 39  from omero.rtypes import rdouble, rstring, rint 
 40  from omero.model import DatasetAnnotationLink, DatasetI, FileAnnotationI, \ 
 41                          OriginalFileI, PlateI, PlateAnnotationLinkI, ScreenI, \ 
 42                          ScreenAnnotationLinkI 
 43  from omero.grid import ImageColumn, LongColumn, PlateColumn, StringColumn, \ 
 44                         WellColumn 
 45  from omero.util.temp_files import create_path, remove_path 
 46  from omero import client 
 47   
 48  from populate_roi import ThreadPool 
 49   
 50  # Handle Python 2.5 built-in ElementTree 
 51  try: 
 52          from xml.etree.cElementTree import XML, Element, SubElement, ElementTree, dump, iterparse 
 53  except ImportError: 
 54          from cElementTree import XML, Element, SubElement, ElementTree, dump, iterparse 
 55   
 56  log = logging.getLogger("omero.util.populate_metadata") 
 57   
 58  def usage(error):
 59      """Prints usage so that we don't have to. :)"""
 60      cmd = sys.argv[0]
 61      print """%s
 62  Usage: %s [options] <target_object> <file>
 63  Runs metadata population code for a given object.
 64  
 65  Options:
 66    -s    OMERO hostname to use [defaults to "localhost"]
 67    -p    OMERO port to use [defaults to 4064]
 68    -u    OMERO username to use
 69    -w    OMERO password
 70    -k    OMERO session key to use
 71    -i    Dump measurement information and exit (no population)
 72    -d    Print debug statements
 73  
 74  Examples:
 75    %s -s localhost -p 14064 -u bob Plate:6 metadata.csv
 76  
 77  Report bugs to ome-devel@lists.openmicroscopy.org.uk""" % (error, cmd, cmd)
 78      sys.exit(2)
 79  
 80  # Global thread pool for use by workers
 81  thread_pool = None
 82  
 83  # Special column names we may add depending on the data type
 84  PLATE_NAME_COLUMN = 'Plate Name'
 85  WELL_NAME_COLUMN = 'Well Name'
 86  
 87  class Skip(object):
 88      """Instance to denote a row skip request."""
 89      pass
 90  
 91  class MetadataError(Exception):
 92      """
 93      Raised by the metadata parsing context when an error condition
 94      is reached.
 95      """
 96      pass
 97  
 98  class HeaderResolver(object):
 99      """
100      Header resolver for known header names which is responsible for creating
101      the column set for the OMERO.tables instance.
102      """
103  
104      DEFAULT_COLUMN_SIZE = 1
105  
106      plate_keys = {
107          'well': WellColumn,
108          'field': ImageColumn,
109          'row': LongColumn,
110          'column': LongColumn,
111          'wellsample': ImageColumn
112      }
113  
114      screen_keys = dict({
115          'plate': PlateColumn,
116      }, **plate_keys)
117  
118      def __init__(self, target_object, headers):
119          self.target_object = target_object
120          self.headers = [v.replace('/', '\\') for v in headers]
121          self.headers_as_lower = [v.lower() for v in self.headers]
122  
123      def create_columns(self):
124          target_class = self.target_object.__class__
125          target_id = self.target_object.id.val
126          if ScreenI is target_class:
127              log.debug('Creating columns for Screen:%d' % target_id)
128              return self.create_columns_screen()
129          if PlateI is target_class:
130              log.debug('Creating columns for Plate:%d' % target_id)
131              return self.create_columns_plate()
132          if DatasetI is target_class:
133              log.debug('Creating columns for Dataset:%d' % target_id)
134              return self.create_columns_dataset()
135          raise MetadataError('Unsupported target object class: %s' \
136                              % target_class)
137  
138      def create_columns_screen(self):
139          columns = list()
140          for i, header_as_lower in enumerate(self.headers_as_lower):
141              name = self.headers[i]
142              try:
143                  column = self.screen_keys[header_as_lower](name, '', list())
144              except KeyError:
145                  column = StringColumn(name, '', self.DEFAULT_COLUMN_SIZE,
146                                        list())
147              columns.append(column)
148          for column in columns:
149              if column.__class__ is PlateColumn:
150                  columns.append(StringColumn(PLATE_NAME_COLUMN, '',
151                                              self.DEFAULT_COLUMN_SIZE, list()))
152              if column.__class__ is WellColumn:
153                  columns.append(StringColumn(WELL_NAME_COLUMN, '',
154                                              self.DEFAULT_COLUMN_SIZE, list()))
155          return columns
156  
157      def create_columns_plate(self):
158          columns = list()
159          for i, header_as_lower in enumerate(self.headers_as_lower):
160              name = self.headers[i]
161              try:
162                  column = self.plate_keys[header_as_lower](name, '', list())
163              except KeyError:
164                  column = StringColumn(name, '', self.DEFAULT_COLUMN_SIZE,
165                                        list())
166              columns.append(column)
167          for column in columns:
168              if column.__class__ is PlateColumn:
169                  columns.append(StringColumn(PLATE_NAME_COLUMN, '',
170                                              self.DEFAULT_COLUMN_SIZE, list()))
171              if column.__class__ is WellColumn:
172                  columns.append(StringColumn(WELL_NAME_COLUMN, '',
173                                              self.DEFAULT_COLUMN_SIZE, list()))
174          return columns
175  
176      def create_columns_dataset(self):
177          raise Exception('To be implemented!')
178  
179  class ValueResolver(object):
180      """
181      Value resolver for column types which is responsible for filling up
182      non-metadata columns with their OMERO data model identifiers.
183      """
184  
185      AS_ALPHA = [chr(v) for v in range(97, 122 + 1)]  # a-z
186      WELL_REGEX = re.compile(r'^([a-zA-Z]+)(\d+)$')
187  
188      def __init__(self, client, target_object):
189          self.client = client
190          self.target_object = target_object
191          self.target_class = self.target_object.__class__
192          if PlateI is self.target_class:
193              return self.load_plate()
194          if DatasetI is self.target_class:
195              return self.load_dataset()
196          if ScreenI is self.target_class:
197              return self.load_screen()
198          raise MetadataError('Unsupported target object class: %s' \
199                              % self.target_class)
200      def load_screen(self):
201          query_service = self.client.getSession().getQueryService()
202          parameters = omero.sys.ParametersI()
203          parameters.addId(self.target_object.id.val)
204          log.debug('Loading Screen:%d' % self.target_object.id.val)
205          self.target_object = query_service.findByQuery(
206              'select s from Screen as s '
207              'join fetch s.plateLinks as p_link '
208              'join fetch p_link.child as p '
209              'where s.id = :id', parameters, {'omero.group': '-1'})
210          if self.target_object is None:
211              raise MetadataError('Could not find target object!')
212          self.wells_by_location = dict()
213          self.wells_by_id = dict()
214          self.plates_by_name = dict()
215          self.plates_by_id = dict()
216          for plate in (l.child for l in self.target_object.copyPlateLinks()):
217              parameters = omero.sys.ParametersI()
218              parameters.addId(plate.id.val)
219              plate = query_service.findByQuery(
220                  'select p from Plate as p '
221                  'join fetch p.wells as w '
222                  'join fetch w.wellSamples as ws '
223                  'where p.id = :id', parameters, {'omero.group': '-1'})
224              self.plates_by_name[plate.name.val] = plate
225              self.plates_by_id[plate.id.val] = plate
226              wells_by_location = dict()
227              wells_by_id = dict()
228              self.wells_by_location[plate.name.val] = wells_by_location
229              self.wells_by_id[plate.id.val] = wells_by_id
230              self.parse_plate(plate, wells_by_location, wells_by_id)
231  
232      def load_plate(self):
233          query_service = self.client.getSession().getQueryService()
234          parameters = omero.sys.ParametersI()
235          parameters.addId(self.target_object.id.val)
236          log.debug('Loading Plate:%d' % self.target_object.id.val)
237          self.target_object = query_service.findByQuery(
238              'select p from Plate as p '
239              'join fetch p.wells as w '
240              'join fetch w.wellSamples as ws '
241              'where p.id = :id', parameters, {'omero.group': '-1'})
242          if self.target_object is None:
243              raise MetadataError('Could not find target object!')
244          self.wells_by_location = dict()
245          self.wells_by_id = dict()
246          wells_by_location = dict()
247          wells_by_id = dict()
248          self.wells_by_location[self.target_object.name.val] = wells_by_location
249          self.wells_by_id[self.target_object.id.val] = wells_by_id
250          self.parse_plate(self.target_object, wells_by_location, wells_by_id)
251  
252      def parse_plate(self, plate, wells_by_location, wells_by_id):
253          # TODO: This should use the PlateNamingConvention. We're assuming rows
254          # as alpha and columns as numeric.
255          for well in plate.copyWells():
256              wells_by_id[well.id.val] = well
257              row = well.row.val
258              # 0 offsetted is not what people use in reality
259              column = str(well.column.val + 1)
260              try:
261                  columns = wells_by_location[self.AS_ALPHA[row]]
262              except KeyError:
263                  wells_by_location[self.AS_ALPHA[row]] = columns = dict()
264              columns[column] = well
265          log.debug('Completed parsing plate: %s' % plate.name.val)
266          for row in wells_by_location:
267              log.debug('%s: %r' % (row, wells_by_location[row].keys()))
268  
269      def load_dataset(self):
270          raise Exception('To be implemented!')
271  
272      def resolve(self, column, value, row):
273          column_class = column.__class__
274          column_as_lower = column.name.lower()
275          if WellColumn is column_class:
276              m = self.WELL_REGEX.match(value)
277              if m is None or len(m.groups()) != 2:
278                  raise MetadataError(
279                      'Cannot parse well identifier "%s" from row: %r' % \
280                      (value, [o[1] for o in row]))
281              plate_row = m.group(1).lower()
282              plate_column = str(long(m.group(2)))
283              if len(self.wells_by_location) == 1:
284                  wells_by_location = self.wells_by_location.values()[0]
285                  log.debug('Parsed "%s" row: %s column: %s' % \
286                            (value, plate_row, plate_column))
287              else:
288                  for column, plate in row:
289                      if column.__class__ is PlateColumn:
290                          wells_by_location = self.wells_by_location[plate]
291                          log.debug('Parsed "%s" row: %s column: %s plate: %s' % \
292                                    (value, plate_row, plate_column, plate))
293                          break
294              try:
295                  return wells_by_location[plate_row][plate_column].id.val
296              except KeyError:
297                  log.debug('Row: %s Column: %s not found!' % \
298                            (plate_row, plate_column))
299                  return -1L
300          if PlateColumn is column_class:
301              try:
302                  return self.plates_by_name[value].id.val
303              except KeyError:
304                  log.warn('Screen is missing plate: %s' % value)
305                  return Skip()
306          if column_as_lower in ('row', 'column') \
307             and column_class is LongColumn:
308              try:
309                  # The value is not 0 offsetted
310                  return long(value) - 1
311              except ValueError:
312                  return long(self.AS_ALPHA.index(value.lower()))
313          if StringColumn is column_class:
314              return value
315          raise MetadataError('Unsupported column class: %s' % column_class)
316  
317  class ParsingContext(object):
318      """Generic parsing context for CSV files."""
319  
320      def __init__(self, client, target_object, file):
321          self.client = client
322          self.target_object = target_object
323          self.file = file
324          self.value_resolver = ValueResolver(self.client, self.target_object)
325  
326      def create_annotation_link(self):
327          self.target_class = self.target_object.__class__
328          if ScreenI is self.target_class:
329              return ScreenAnnotationLinkI()
330          if PlateI is self.target_class:
331              return PlateAnnotationLinkI()
332          if DatasetI is self.target_class:
333              return DatasetAnnotationLink()
334          raise MetadataError('Unsupported target object class: %s' \
335                              % self.target_class)
336  
337      def get_column_widths(self):
338          widths = list()
339          for column in self.columns:
340              try:
341                  widths.append(column.size)
342              except AttributeError:
343                  widths.append(None)
344          return widths
345  
346      def parse(self):
347          data = open(self.file, 'U')
348          try:
349              rows = list(csv.reader(data, delimiter=','))
350          finally:
351              data.close()
352          log.debug('Header: %r' % rows[0])
353          header_resolver = HeaderResolver(self.target_object, rows[0])
354          self.columns = header_resolver.create_columns()
355          log.debug('Columns: %r' % self.columns)
356          self.populate(rows[1:])
357          self.post_process()
358          log.debug('Column widths: %r' % self.get_column_widths())
359          log.debug('Columns: %r' % \
360                    [(o.name, len(o.values)) for o in self.columns])
361          # Paranoid debugging
362          #for i in range(len(self.columns[0].values)):
363          #    values = list()
364          #    for column in self.columns:
365          #        values.append(column.values[i])
366          #    log.debug('Row: %r' % values)
367  
368      def populate(self, rows):
369          value = None
370          for row in rows:
371              values = list()
372              row = [(self.columns[i], value) for i, value in enumerate(row)]
373              for column, original_value in row:
374                  value = self.value_resolver.resolve(column, original_value, row)
375                  if value.__class__ is Skip:
376                      break
377                  values.append(value)
378                  try:
379                      if value.__class__ is not long:
380                          column.size = max(column.size, len(value))
381                  except TypeError:
382                      log.error('Original value "%s" now "%s" of bad type!' % \
383                                (original_value, value))
384                      raise
385              if value.__class__ is not Skip:
386                  values.reverse()
387                  for column in self.columns:
388                      if column.name in (PLATE_NAME_COLUMN, WELL_NAME_COLUMN):
389                          continue
390                      try:
391                          column.values.append(values.pop())
392                      except IndexError:
393                          log.error('Column %s has no values to pop.' % \
394                                    column.name)
395                          raise
396  
397      def post_process(self):
398          columns_by_name = dict()
399          plate_column = None
400          well_column = None
401          well_name_column = None
402          plate_name_column = None
403          for column in self.columns:
404              columns_by_name[column.name] = column
405              if column.__class__ is PlateColumn:
406                  plate_column = column
407              elif column.__class__ is WellColumn:
408                  well_column = column
409              elif column.name == WELL_NAME_COLUMN:
410                  well_name_column = column
411              elif column.name == PLATE_NAME_COLUMN:
412                  plate_name_column = column
413          if well_name_column is None and plate_name_column is None:
414              log.info('Nothing to do during post processing.')
415          for i in range(0, len(self.columns[0].values)):
416              if well_name_column is not None:
417                  if PlateI is self.value_resolver.target_class:
418                      plate = self.value_resolver.target_object.id.val
419                  elif ScreenI is self.value_resolver.target_class:
420                      plate = columns_by_name['Plate'].values[i]
421                  try:
422                      well = self.value_resolver.wells_by_id[plate]
423                      well = well[well_column.values[i]]
424                      row = well.row.val
425                      col = well.column.val
426                  except KeyError:
427                      log.error('Missing row or column for well name population!')
428                      raise
429                  row = self.value_resolver.AS_ALPHA[row]
430                  v = '%s%d' % (row, col + 1)
431                  well_name_column.size = max(well_name_column.size, len(v))
432                  well_name_column.values.append(v)
433              else:
434                  log.info('Missing well name column, skipping.')
435              if plate_name_column is not None:
436                  plate = columns_by_name['Plate'].values[i]
437                  plate = self.value_resolver.plates_by_id[plate]
438                  v = plate.name.val
439                  plate_name_column.size = max(plate_name_column.size, len(v))
440                  plate_name_column.values.append(v)
441              else:
442                  log.info('Missing plate name column, skipping.')
443  
444      def write_to_omero(self):
445          sf = self.client.getSession()
446          sr = sf.sharedResources()
447          update_service = sf.getUpdateService()
448          name = 'bulk_annotations'
449          table = sr.newTable(1, name)
450          if table is None:
451              raise MetadataError(
452                  "Unable to create table: %s" % name)
453          original_file = table.getOriginalFile()
454          log.info('Created new table OriginalFile:%d' % original_file.id.val)
455          table.initialize(self.columns)
456          log.info('Table initialized with %d columns.' % (len(self.columns)))
457          table.addData(self.columns)
458          log.info('Added data column data.')
459          table.close()
460          file_annotation = FileAnnotationI()
461          file_annotation.ns = \
462              rstring('openmicroscopy.org/omero/bulk_annotations')
463          file_annotation.description = rstring(name)
464          file_annotation.file = OriginalFileI(original_file.id.val, False)
465          link = self.create_annotation_link()
466          link.parent = self.target_object
467          link.child = file_annotation
468          group = str(self.value_resolver.target_object.details.group.id.val)
469          update_service.saveObject(link, {'omero.group': group})
470  
471  def parse_target_object(target_object):
472      type, id = target_object.split(':')
473      if 'Dataset' == type:
474          return DatasetI(long(id), False)
475      if 'Plate' == type:
476          return PlateI(long(id), False)
477      if 'Screen' == type:
478          return ScreenI(long(id), False)
479      raise ValueError('Unsupported target object: %s' % target_object)
480  
481  if __name__ == "__main__":
482      try:
483          options, args = getopt(sys.argv[1:], "s:p:u:w:k:id")
484      except GetoptError, (msg, opt):
485          usage(msg)
486  
487      try:
488          target_object, file = args
489          target_object = parse_target_object(target_object)
490      except ValueError:
491          usage('Target object and file must be specified!')
492  
493      username = None
494      password = None
495      hostname = 'localhost'
496      port = 4064  # SSL
497      info = False
498      session_key = None
499      logging_level = logging.INFO
500      thread_count = 1
501      for option, argument in options:
502          if option == "-u":
503              username = argument
504          if option == "-w":
505              password = argument
506          if option == "-s":
507              hostname = argument
508          if option == "-p":
509              port = int(argument)
510          if option == "-i":
511              info = True
512          if option == "-k":
513              session_key = argument
514          if option == "-d":
515              logging_level = logging.DEBUG
516          if option == "-t":
517              thread_count = int(argument)
518      if session_key is None and username is None:
519          usage("Username must be specified!")
520      if session_key is None and hostname is None:
521          usage("Host name must be specified!")
522      if session_key is None and password is None:
523          password = getpass()
524  
525      logging.basicConfig(level=logging_level)
526      client = client(hostname, port)
527      client.setAgent("OMERO.populate_metadata")
528      client.enableKeepAlive(60)
529      try:
530          if session_key is not None:
531              client.joinSession(session_key)
532          else:
533              client.createSession(username, password)
534  
535          log.debug('Creating pool of %d threads' % thread_count)
536          thread_pool = ThreadPool(thread_count)
537          ctx = ParsingContext(client, target_object, file)
538          ctx.parse()
539          if not info:
540              ctx.write_to_omero()
541      finally:
542          pass
543          client.closeSession()
544  
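Example usage (not part of the module source above): a minimal sketch of driving the same code path programmatically rather than via the command line. The hostname, port, credentials, Plate ID and CSV path are hypothetical placeholders; only classes and functions defined in this module (parse_target_object, ParsingContext) and the omero client calls already used in its __main__ block are assumed.

    # metadata.csv (hypothetical input; the first row is the header that
    # HeaderResolver maps to OMERO.tables columns, e.g. a "Well" column):
    #   Well,Gene,Concentration
    #   A1,BRCA1,0.5
    #   A2,BRCA2,1.0
    #
    # Equivalent shell invocation, as printed by usage():
    #   populate_metadata.py -s localhost -p 4064 -u bob Plate:6 metadata.csv

    from omero import client as omero_client
    from omero.util.populate_metadata import ParsingContext, parse_target_object

    c = omero_client('localhost', 4064)            # hypothetical host/port
    try:
        c.createSession('bob', 'secret')           # hypothetical credentials
        target = parse_target_object('Plate:6')    # hypothetical Plate ID
        ctx = ParsingContext(c, target, 'metadata.csv')
        ctx.parse()                                # read the CSV and build columns
        ctx.write_to_omero()                       # create the bulk_annotations table
    finally:
        c.closeSession()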