Source code for tkp.db.orm

"""
This module contains lightweight container objects that corresponds
to a dataset, image or extracted source in the database; it is actually a
mini Object Relation Mapper (ORM). The correspondence between the object
and table row is matched through the private _id attributes.

Each dataset contains several database Images; each Image contains a
number of ExtractedSources. The database Images correspond to the
images table in the database, *not* to sourcefinder images or actual
image data files on disk (this distinction is important; while there
are certainly parts in common, several are not).


The current setup is done in large part to keep the database and
sourcefinder (and other parts of the TKP package) separate; tightly
integrated database tables/sourcefinder images/disk files make it more
difficult to improve the code or distribute parts separately.

Usage
=====

In practice, a DataSet object is created, and separate Images are
created referencing that DataSet() instance; ids are automatically
assigned where necessary (i.e., on creation of a new entry (row) in
the database).

Objects can also be created using an existing id; data is then taken
from the corresponding table row in the database.


Creating new objects
====================

The following code is an usage example, but should not be used as a
doc test (since the database value can differ, and thus the test would
fail)::

    # database sets up and holds the connection to the actual database
    >>> database = tkp.db.database.Database()

    # Each object type takes a data dictionary on creation, which for newly objects
    # has some required keys (& values). For a DataSet, this is only 'description';
    # for an Image, the keys are 'freq_eff', 'freq_bw_', 'taustart_ts',
    # 'tau_time' & 'url'
    # The required values are stored in the the REQUIRED attribute
    >>> dataset = DataSet(data={'description': 'a dataset'}, database=database)

    # Here, dataset indirectly holds the database connection:
    >>> dataset.database
    DataBase(host=heastro1, name=trap, user=trap, ...)
    >>> image1 = Image(data={'freq_eff': '80e6', 'freq_bw': 1e6, \
        'taustart_ts': datetime(2011, 5, 1, 0, 0, 0), 'tau_time': 1800.,  'url': '/'}, dataset=dataset)  # initialize with defaults
        # note the dataset kwarg, which holds the database connection
    >>> image1.tau_time
    1800.
    >>> image1.taustart_ts
    datetime.datetime(2011, 5, 1, 0, 0, 0)
    >>> image2 = Image(data={'freq_eff': '80e6', 'freq_bw': 1e6, \
        'taustart_ts': datetime(2011, 5, 1, 0, 1, 0), 'tau_time': 1500.,'url': '/'}, dataset=dataset)
    >>> image2.tau_time
    1500
    >>> image2.taustart_ts
    datetime.datetime(2011, 5, 1, 0, 1, 0)
    # Images created with a dataset object, are automatically added to that dataset:
    >>> dataset.images
    set([<tkp.database.dataset.Image object at 0x26fb6d0>, <tkp.database.dataset.Image object at 0x26fb790>])

Updating objects
================

To update objects, use the update() method.

This method does two things, in the following order:

1. it updates from the database to the object: if there have been
changes in the database, the object will reflect that after executing
update()

2. then, it updates the object (and the database) with values supplied
by the user. The latter values are optional; no supplied values simply
means there aren't any updates.


    >>> image2.update(tau_time=2500)    # updates the database as well
    >>> image2.tau_time
    2500
    >>> database.cursor.execute("SELECT tau_time FROM images WHERE imageid=%s" % \
                                 (image2.id,))
    >>> database.cursors.fetchone()[0]
    2500
    # Manually update the database
    >>> database.cursor.execute("UPDATE images SET tau_time=2000.0 imageid=%s" % \
                                 (image2.id,))
    >>> image2.tau_time   # not updated yet!
    2500
    >>> image2.update()
    >>> image2.tau_time
    2000


Assigning objects to a table row on creation
============================================

It is also possible to create a DataSet, Image or ExtractedSource instance from the
database, using the ``id`` in the initializer::

    >>> dataset2 = DataSet(id=dataset.id, database=database)
    >>> image3 = Image(imageid=image2.id, database=database)
    >>> image3.tau_time
    2000

If an ``id`` is supplied, ``data`` is ignored.
"""

import logging
from tkp.db.generic import columns_from_table, set_columns_for_table
from tkp.db.general import (insert_dataset, insert_image,
                            insert_extracted_sources, lightcurve)
from tkp.db.associations import associate_extracted_sources
import tkp.db
import tkp.db.quality
from tkp.db.database import Database


logger = logging.getLogger(__name__)


[docs]class DBObject(object):
    """Generic mini-ORM object

    Derived objects will need to implement __init__, which for
    practical reasons is split up in __init__ and _init_data: the
    latter is called at the end __init__, so a derived __init__ would
    have super(Derived, self).__init__() at the start and
    super(Derived, self)._init_data() at the end.

    __init__ takes care of setting the id, the supplied `data` dictionary
    and the connection to the database.

    _init_data sets the actual data either from the database (in case
    of a supplied id) or from the `data` dictionary.
    """

    def __init__(self, data=None, database=None, id=None):
        """Basic initialization.

        Inherited classes need to implement any actual database action,
        by calling self._init_data() at the end of their __init__
        method.
        """
        # Call the id property to set the _id attribute
        self._id = id
        self._data = {} if data is None else data.copy()
        self.database = database

    def _init_data(self):
        """Set up the data, either by creating a new DBOject or
        updating it from the database using the id

        This method should only be called from __init__(), probably at the end.

        Note that this does prevent proper (multi) inheritance,
        because it would get called several times then.

        Raises:

            AttributeError if a required data keyword is missing.

        """
        if self._id is not None:
            # object created using an existing table row
            self.update()
        else:
            # Verify required data keys
            for key in self.REQUIRED:
                if key not in self._data:
                    raise AttributeError("missing required data key: %s" % key)
            self.id

[docs]    def __getattr__(self, name):
        """Obtain the 'name' attribute, where 'name' is a database column name"""
        #DEVELOPERS NOTE: if this property fails for some reason, python will
        #ignore it, and continue using the __getattr__ method. This is very
        #confusing. So if for any reason you are getting 'attribute not found'
        #errors while you don't expect it, I suggest temporarily disabling the
        #__getattr__ method until you located the problem.

        if name in self._data:
            return self._data[name]
        else:
            raise AttributeError("attribute '%s' not found" % name)

    @property
    def id(self):
        """Add or obtain an id to/from the table

        The id is generated if self._id does not exist, effectively
        creating a new row in the database.

        Several containers have their specific SQL function to create
        a new object, so this property will need to overridden.


        """
        if self._id is None:
            query = ("INSERT INTO " + self.TABLE + " (" +
                     ", ".join(self._data.iterkeys()) + ") VALUES (" +
                     ", ".join(["%s"] * len(self._data)) + ")"
                     )
            if self.database.engine == "postgresql":
                query = query + "RETURNING ID"
            values = tuple(self._data.itervalues())
            cursor = self.database.cursor
            try:
                # Insert a default source
                cursor.execute(query, values)
                if not self.database.connection.autocommit:
                    self.database.connection.commit()
                if self.database.engine == "monetdb":
                    self._id = cursor.lastrowid
                elif self.database.engine == "postgresql":
                    self._id = cursor.fetchone()[0]
                else:
                    raise self.database.connection.Error(
                         "Database engine not implemented in ORM.")
            except self.database.connection.Error:
                logger.warn("insertion into database failed: %s",
                             (query % values))
                raise
            except Exception as e:
                logging.error("ORM failed: %s" % str(e))
                raise
        return self._id

[docs]    def update(self, **kwargs):
        """Update attributes from database, and set database values to
        kwargs when provided

        This method performs two functions, the first always and the
        second optionally after the first:

            - it updates the attributes from the database. That is, it
              makes sure the Python instance is synchronized with the
              database.

            - (optional): it sets the column values in the database to
              the values provided through kwargs, for the associated
              database row. Attributes for the instance are of course
              also set to these values. Any kwargs that do not
              correspond to a column name are simply ignored.

        This function therefore first updates the instance from the
        database, and then optionally the database from the instance
        (with the provided keyword arguments).
        """

        self._sync_with_database()
        self._set_data(**kwargs)

    def _sync_with_database(self):
        """Update object attributes from the database"""
        results = columns_from_table(self.TABLE, keywords=None,
                                            where={self.ID: self._id})
        # Shallow copy, but that's ok: all database values are
        # immutable (including datetime objects)
        if results:
            self._data = results[0].copy()
        else:
            self._data = {}

    def _set_data(self, **kwargs):
        """Update the database with the supplied **kwargs.

        Supplied keywords that do not exist in the database will lead
        to a database error.
        """

        if not kwargs:
            return
        set_columns_for_table(self.TABLE, data=kwargs,
                                  where={self.ID: self._id})
        self._data.update(kwargs)


[docs]class DataSet(DBObject):
    """Class corresponding to the dataset table in the database"""

    TABLE = 'dataset'
    ID = 'id'
    REQUIRED = ('description',)

    def __init__(self, data=None, database=None, id=None):
        """If id is supplied, the data and image arguments are ignored."""
        super(DataSet, self).__init__(
            data=data, database=database, id=id)
        self.images = set()
        if not self.database:
            self.database = Database()
        self._init_data()

    def __str__(self):
        return 'DataSet: "%s". Database ID: %s, %d images.' % (
            self.description, str(self.id), len(self.images))

    # Inserting datasets is handled a little different than normal inserts
    # (We make use of the SQL function insertDataset)
    @property
    def id(self):
        """Add or obtain an id to/from the table

        This uses the SQL function insertDataset().
        """
        if self._id is None:
            try:
                self._id = insert_dataset(self._data['description'])
            except Exception as e:
                logger.error("ORM: error inserting dataset,  %s: %s" % (type(e).__name__, str(e)))
                raise
        return self._id

[docs]    def update_images(self):
        """Renew the set of images by getting the images for this
        dataset from the database. Implemented separately from update(),
        since normally this would be too much overhead"""
        query = "SELECT id FROM image WHERE dataset = %s ORDER BY id" % self._id
        cursor = tkp.db.execute(query)
        result = cursor.fetchall()
        image_ids = [row[0] for row in result]
        self.images = [Image(database=self.database, id=id) for id in image_ids]

[docs]    def runcat_entries(self):
        """
        Returns:
            a list of dictionarys:
            representing rows in runningcatalog,
            for all sources belonging to this dataset

            Column 'id' is returned with the key 'runcat'

            Currently only returns 3 columns:
            [{'runcat,'xtrsrc','datapoints'}]
        """
        return columns_from_table('runningcatalog',
                                      keywords=['id', 'xtrsrc', 'datapoints'],
                                      alias={'id':'runcat'},
                                      where={'dataset':self.id})


[docs]    def frequency_bands(self):
        """Return a list of distinct bands present in the dataset."""
        query = """\
        SELECT DISTINCT(band)
          FROM image
         WHERE dataset = %s
        """
        self.database.cursor.execute(query, (self.id,))
        bands = zip(*self.database.cursor.fetchall())[0]
        return bands


[docs]class Image(DBObject):
    """Class corresponding to the images table in the database"""

    TABLE = 'image'
    ID = 'id'
    REQUIRED = ('dataset', 'tau_time', 'freq_eff', 'freq_bw', 'taustart_ts',
                'beam_smaj_pix', 'beam_smin_pix', 'beam_pa_rad',
                'deltax', 'deltay',
                'url', 'centre_ra', 'centre_decl', 'xtr_radius', 'rms_qc')



    def __init__(self, data=None, dataset=None, database=None, id=None):
        """If id is supplied, the data and image arguments are ignored."""
        super(Image, self).__init__(data=data, database=database, id=id)
        # Special part to deal when a DataSet() is supplied
        self.dataset = dataset
        self.rejected = False
        if self.dataset:
            if self.dataset.database and not self.database:
                self.database = self.dataset.database
            self.dataset.images.add(self)
            self._data.setdefault('dataset', self.dataset.id)
        self.sources = set()
        if not self.database:
            self.database = Database()
        self._init_data()
        if not self.dataset:
            self.dataset = DataSet(id=self._data['dataset'], database=self.database)

        self.update_rejected()


    # Inserting images is handled a little different than normal inserts
    # -- We call an SQL function 'insertImage' which takes care of
    #    assigning a new image id.
    @property
    def id(self):
        """Add or obtain an id to/from the table

        This uses the SQL function insertImage()
        """

        if self._id is None:
            try:
                #if 'bsmaj' not in self._data:
                #    self._data['bsmaj'] = None
                #    self._data['bsmin'] = None
                #    self._data['bpa'] = None
                #    self._data['deltax'] = None
                #    self._data['deltay'] = None
                # Insert a default image
                self._id = insert_image(self.dataset.id,
                    self._data['freq_eff'], self._data['freq_bw'],
                    self._data['taustart_ts'], self._data['tau_time'],
                    self._data['beam_smaj_pix'], self._data['beam_smin_pix'],
                    self._data['beam_pa_rad'],
                    self._data['deltax'],
                    self._data['deltay'],
                    self._data['url'],
                    self._data['centre_ra'], #Degrees J2000
                    self._data['centre_decl'], #Degrees J2000
                    self._data['xtr_radius'], #Degrees
                    self._data['rms_qc'],
                    self._data.get('rms_min',None),
                    self._data.get('rms_max',None),
                    self._data.get('detection_thresh',None),
                    self._data.get('analysis_thresh',None),
                )
            except Exception as e:
                logger.error("ORM: error inserting image,  %s: %s" %
                                (type(e).__name__, str(e)))
                raise
        return self._id

[docs]    def update_rejected(self):
        """Update self.rejected with the rejected status. Will be false
        if not rejected, will be a list of reject descriptions if rejected"""
        self.rejected = tkp.db.quality.isrejected(self.id)

[docs]    def update_sources(self):
        """Renew the set of sources by getting the sources for this
        image from the database

        This method is separately implemented, because it's not always necessary
        and potentially (for an image with dozens or more sources) time & memory
        consuming.
        """

        query = "SELECT id FROM extractedsource WHERE image = %s"
        try:
            self.database.cursor.execute(query, (self._id,))
            results = self.database.cursor.fetchall()
        except self.database.connection.Error, e:
            query = query % self._id
            logger.warn("database failed on query: %s", query)
            raise
        sources = set()
        for result in results:
            sources.add(ExtractedSource(database=self.database, id=result[0]))
        self.sources = sources


[docs]    def insert_extracted_sources(self, results, extract='blind'):
        """Insert a list of sources

        Args:

            results (list): list of
                utility.containers.ExtractionResult objects (as
                returned from
                sourcefinder.image.ImageData().extract()), or a list
                of data tuples with the source information as follows:
                (ra, dec,
                ra_fit_err, dec_fit_err,
                peak, peak_err,
                flux, flux_err,
                significance level,
                beam major width (as), beam minor width(as),
                beam parallactic angle
                ew_sys_err, ns_sys_err,
                error_radius).
            extract (str):'blind', 'ff_nd' or 'ff_ms'
                (see db.general.insert_extracted_sources)
       """
       #To do: Figure out a saner method of passing the results around
       # (Namedtuple, for starters?)

        insert_extracted_sources(self._id, results=results, extract_type=extract)

[docs]    def associate_extracted_sources(self, deRuiter_r, new_source_sigma_margin):
        """Associate sources from the last images with previously
        extracted sources within the same dataset

        Args:

            deRuiter_r (float): The De Ruiter radius for source
                association. The default value is set through the
                tkp.config module
        """
        associate_extracted_sources(self._id, deRuiter_r,
                                    new_source_sigma_margin)


[docs]class ExtractedSource(DBObject):
    """Class corresponding to the extractedsource table in the database"""

    TABLE = 'extractedsource'
    ID = 'id'
    REQUIRED = ('image', 'zone',
                'ra', 'decl', 'ra_err', 'decl_err',
                'uncertainty_ew', 'uncertainty_ns',
                'ra_fit_err', 'decl_fit_err', 'ew_sys_err', 'ns_sys_err',
                'error_radius',
                'x', 'y', 'z',
                'racosdecl', 'det_sigma')

    def __init__(self, data=None, image=None, database=None, id=None):
        """If id is supplied, the data and image arguments are ignored."""
        super(ExtractedSource, self).__init__(
            data=data, database=database, id=id)
        # Special part to deal when an Image() is supplied
        self.image = image
        if self.image:
            if self.image.dataset.database and not self.database:
                self.database = self.image.dataset.database
            self.image.sources.add(self)
            self._data.setdefault('image', self.image.id)
        if not self.database:
            raise ValueError(
                "can't create ExtractedSource object without a Database() object")
        self._init_data()

[docs]    def lightcurve(self):
        """Obtain the complete light curve (within the current dataset)
        for this source.

        Returns:
            (list) list of 5-tuples, each tuple being:
                - observation start time as a datetime.datetime object
                - integration time (float)
                - integrated flux (float)
                - integrated flux error (float)
                - database ID of this particular source
        """

        return lightcurve(self._id)