Source code for rios.calcstats

"""
This module creates pyramid layers and calculates statistics for image
files. Much of it was originally written for ERDAS Imagine files, but it
should work with any other format that supports pyramid layers and statistics.

"""
# This file is part of RIOS - Raster I/O Simplification
# Copyright (C) 2012  Sam Gillingham, Neil Flood
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import numpy
from osgeo import gdal
gdal.UseExceptions()
from distutils.version import LooseVersion
from . import cuiprogress
from .rioserrors import ProcessCancelledError

# The GDAL RFC40 facilities are used when RasterAttributeTable offers
# ReadAsArray, unless the RIOS_HISTOGRAM_IGNORE_RFC40 environment variable
# is set (to anything).
haveRFC40 = (os.getenv('RIOS_HISTOGRAM_IGNORE_RFC40') is None and
    hasattr(gdal.RasterAttributeTable, 'ReadAsArray'))

# https://trac.osgeo.org/gdal/ticket/6854 is treated as fixed from GDAL 2.2.0
# onwards, which allows the rat.SetLinearBinning call to be used rather than
# metadata. The hasattr guard makes things fail slightly less drastically
# when running from ReadTheDocs, where gdal may have no __version__.
haveLinearBinningFix = (hasattr(gdal, '__version__') and
    LooseVersion(gdal.__version__) >= LooseVersion('2.2.0'))

# Defaults used when calculating overviews (i.e. pyramid layers). Each one
# can be overridden through an environment variable.
dfltOverviewLvls = os.getenv('RIOS_DFLT_OVERVIEWLEVELS')
if dfltOverviewLvls is not None:
    # comma separated list of integer levels
    DEFAULT_OVERVIEWLEVELS = [int(i) for i in dfltOverviewLvls.split(',')]
else:
    DEFAULT_OVERVIEWLEVELS = [4, 8, 16, 32, 64, 128, 256, 512]
DEFAULT_MINOVERVIEWDIM = int(os.getenv('RIOS_DFLT_MINOVERLEVELDIM', 33))
DEFAULT_OVERVIEWAGGREGRATIONTYPE = os.getenv('RIOS_DFLT_OVERVIEWAGGTYPE',
    "AVERAGE")


def progressFunc(value,string,userdata):
    """
    Progress callback passed to GDAL (BuildOverviews and GetHistogram
    in this module).

    value is treated as the completed fraction of the current step, with
    1.0 meaning the step has finished. string is not used. userdata is
    expected to carry three attributes: progress (the progress object), 
    nbands (the number of equally sized steps making up 100%) and 
    curroffset (the percentage already completed by earlier steps).

    Returns False if the progress object reports a cancel, True otherwise.
    """
    stepFraction = value / userdata.nbands
    userdata.progress.setProgress(userdata.curroffset + stepFraction * 100)

    # this step is done, so later calls start from the next offset
    if value == 1.0:
        userdata.curroffset += 100.0 / userdata.nbands

    cancelled = userdata.progress.wasCancelled()
    return not cancelled
  
# make userdata object with progress and num bands
class ProgressUserData(object):
    """
    Empty container handed to progressFunc as its userdata argument. 
    Callers attach the progress, nbands and curroffset attributes 
    to an instance before passing it on.
    """

def addPyramid(ds, progress, 
        minoverviewdim=DEFAULT_MINOVERVIEWDIM, 
        levels=DEFAULT_OVERVIEWLEVELS,
        aggregationType=None):
    """
    Adds Pyramid layers to the dataset. A level from the levels
    parameter is built when the smaller raster dimension, integer 
    divided by that level, is still greater than minoverviewdim. 
    
    If aggregationType is None it is chosen from the LAYER_TYPE
    metadata item of the first band: "AVERAGE" for 'athematic', 
    "NEAREST" for any other value, and the module default when the 
    item is not present. 
    
    Uses gdal.Dataset.BuildOverviews() to do the work. 
    
    Raises ProcessCancelledError if the progress object reports that 
    it was cancelled. 
    
    """
    progress.setLabelText("Computing Pyramid Layers...")
    progress.setProgress(0)

    # the smaller of the two dimensions decides how many overviews fit
    mindim = min(ds.RasterXSize, ds.RasterYSize)

    # Test each level on its own rather than counting the ones that 
    # qualify and slicing that many off the front, so the right levels 
    # are chosen even if the list is not in ascending order. 
    overviewLevels = [lvl for lvl in levels 
                        if (mindim // lvl) > minoverviewdim]

    # Need to find out if we are thematic or continuous. 
    if aggregationType is None:
        layerType = ds.GetRasterBand(1).GetMetadata().get('LAYER_TYPE')
        if layerType is None:
            aggregationType = DEFAULT_OVERVIEWAGGREGRATIONTYPE
        elif layerType == 'athematic':
            aggregationType = "AVERAGE"
        else:
            aggregationType = "NEAREST"
    
    # state shared with the progress callback
    userdata = ProgressUserData()
    userdata.progress = progress
    userdata.nbands = ds.RasterCount
    userdata.curroffset = 0
   
    ds.BuildOverviews(aggregationType, overviewLevels, progressFunc, userdata)
  
    if progress.wasCancelled():
        raise ProcessCancelledError()

    # make sure it goes to 100%
    progress.setProgress(100)

def findOrCreateColumn(ratObj, usage, name, dtype):
    """
    Looks through the columns of ratObj for the first one whose 
    usage equals the given usage. If there is none, a column is 
    created with the supplied name, dtype and usage. 

    Returns a tuple of (column index, isNew) where isNew is True
    only when the column had to be created.
    """
    ncols = ratObj.GetColumnCount()
    existing = next((col for col in range(ncols) 
                    if ratObj.GetUsageOfCol(col) == usage), None)
    if existing is not None:
        return existing, False

    # nothing matched - the new column is appended, so it takes 
    # the index equal to the old column count
    ratObj.CreateColumn(name, dtype, usage)
    return ncols, True

# GDAL data type groups that addStatistics tests band.DataType against 
# when choosing histogram bins
gdalLargeIntTypes = {gdal.GDT_Int16, gdal.GDT_UInt16, 
    gdal.GDT_Int32, gdal.GDT_UInt32}
gdalFloatTypes = {gdal.GDT_Float32, gdal.GDT_Float64}
def _histogramBinning(band, layerType, minval, maxval):
    """
    Chooses the histogram layout for a band from its data type, its 
    LAYER_TYPE metadata value (may be None) and the computed min and max. 

    Returns the tuple (histmin, histmax, histstep, histCalcMin, 
    histCalcMax, histnbins, binFunction), or None when no layout is 
    defined for the band (e.g. a non-thematic complex band). 
    histCalcMin and histCalcMax are the bounds to hand to 
    gdal.Band.GetHistogram(), and binFunction is the value for the 
    STATISTICS_HISTOBINFUNCTION metadata item. 

    histstep is given explicitly for every layout. In principle it could 
    be calculated, but for the bin-per-value layouts it needs to be 
    exactly equal to 1, so this avoids rounding error problems. 
    """
    if band.DataType == gdal.GDT_Byte:
        # if byte data use 256 bins and the whole range
        return (0, 255, 1.0, -0.5, 255.5, 256, 'direct')

    if layerType == 'thematic':
        # all other thematic types a bin per value, counted from zero
        histmax = int(numpy.ceil(maxval))
        return (0, histmax, 1.0, -0.5, maxval + 0.5, histmax + 1, 'direct')

    if band.DataType in gdalLargeIntTypes:
        histrange = int(numpy.ceil(maxval) - numpy.floor(minval)) + 1
        if histrange <= 256:
            # few enough distinct values for a bin each
            return (minval, maxval, 1.0, minval - 0.5, maxval + 0.5, 
                    histrange, 'direct')
        return (minval, maxval, float(maxval - minval) / 256, 
                minval, maxval, 256, 'linear')

    if band.DataType in gdalFloatTypes:
        return (minval, maxval, float(maxval - minval) / 256, 
                minval, maxval, 256, 'linear')

    # Complex number data types are not handled, as it is not clear 
    # what a histogram or a median would mean for such types. 
    return None


def addStatistics(ds,progress,ignore=None):
    """
    Calculates statistics and adds them to the image
    
    Uses gdal.Band.ComputeStatistics() for mean, stddev, min and max,
    and gdal.Band.GetHistogram() to do histogram calculation. 
    The median and mode are estimated using the histogram, and so 
    for larger datatypes, they will be approximate only. 
    
    For thematic layers, the histogram is calculated with as many bins 
    as required, for athematic integer and float types, a maximum
    of 256 bins is used. Bands for which no histogram layout is defined 
    (e.g. non-thematic complex bands) get the basic statistics only; no 
    histogram, mode or median is written for them. 
    
    ds is the GDAL dataset to process, and every band of it is updated. 
    progress is an object with setLabelText(), setProgress() and 
    wasCancelled() methods. If ignore is not None it is set as the 
    no data value of every band before statistics are calculated. If 
    GDAL reports that a band has no valid pixels, ignore is recorded 
    as the min, max and mean, with a stddev of 0. 
    
    Raises ProcessCancelledError if the progress object reports that 
    it was cancelled. Any other RuntimeError from ComputeStatistics 
    is re-raised. 
    
    """
    progress.setLabelText("Computing Statistics...")
    progress.setProgress(0)
    percent = 0
    percentstep = 100.0 / (ds.RasterCount * 2) # 2 steps for each layer

    # flush the cache. This ensures that any unwritten data is 
    # written to file so we get the right stats. It also 
    # makes sure any metadata is written on HFA. This means
    # the LAYER_TYPE setting will be picked up by rat.SetLinearBinning()
    ds.FlushCache()
  
    for bandnum in range(ds.RasterCount):
        band = ds.GetRasterBand(bandnum + 1)

        # fill in the metadata
        tmpmeta = band.GetMetadata()
    
        if ignore is not None:
            # tell QGIS that the ignore value was ignored
            band.SetNoDataValue(ignore)
            tmpmeta["STATISTICS_EXCLUDEDVALUES"] = repr(ignore) # doesn't seem to do anything
      
        # get GDAL to calculate statistics - force recalculation. Trap errors 
        useExceptions = gdal.GetUseExceptions()
        gdal.UseExceptions()
        try:
            (minval,maxval,meanval,stddevval) = band.ComputeStatistics(False)
        except RuntimeError as e:
            if not str(e).endswith('Failed to compute statistics, no valid pixels found in sampling.'):
                # not the error we know how to handle; keep the traceback
                raise
            minval = ignore
            maxval = ignore
            meanval = ignore
            stddevval = 0
        finally:
            # put the caller's exception mode back, even when re-raising
            if not useExceptions:
                gdal.DontUseExceptions()

        percent = percent + percentstep
        progress.setProgress(percent)
    
        tmpmeta["STATISTICS_MINIMUM"] = repr(minval)
        tmpmeta["STATISTICS_MAXIMUM"] = repr(maxval)
        tmpmeta["STATISTICS_MEAN"]    = repr(meanval)
        tmpmeta["STATISTICS_STDDEV"]  = repr(stddevval)
        # because we did at full res - these are the default anyway
        tmpmeta["STATISTICS_SKIPFACTORX"] = "1"
        tmpmeta["STATISTICS_SKIPFACTORY"] = "1"

        # create a histogram so we can do the mode and median
        binning = _histogramBinning(band, tmpmeta.get("LAYER_TYPE"), 
                        minval, maxval)

        # we may use this ratObj reference for the colours below also
        # may be None if format does not support RATs. It also stays 
        # None when there is no histogram to write. 
        ratObj = None

        if binning is not None:
            (histmin, histmax, histstep, histCalcMin, histCalcMax, 
                histnbins, binFunction) = binning
            tmpmeta["STATISTICS_HISTOBINFUNCTION"] = binFunction
      
            # the histogram is the second of the two steps for this band
            userdata = ProgressUserData()
            userdata.progress = progress
            userdata.nbands = ds.RasterCount * 2
            userdata.curroffset = percent
      
            # get histogram and force GDAL to recalculate it
            hist = band.GetHistogram(histCalcMin, histCalcMax, histnbins, False, 
                            False, progressFunc, userdata)
        
            # Check if GDAL's histogram code overflowed. This is not a fool-proof test,
            # as some overflows will not result in negative counts. 
            histogramOverflow = (min(hist) < 0)
        
            ratObj = band.GetDefaultRAT()

            if not histogramOverflow:
                # comes back as a list for some reason
                hist = numpy.array(hist)

                # float bands keep fractional mode/median, others are rounded
                isFloat = band.DataType in gdalFloatTypes

                # do the mode - bin with the highest count
                modebin = numpy.argmax(hist)
                modeval = modebin * histstep + histmin
                if isFloat:
                    tmpmeta["STATISTICS_MODE"] = repr(modeval)
                else:
                    tmpmeta["STATISTICS_MODE"] = repr(int(round(modeval)))

                if haveRFC40 and ratObj is not None:
                    histIndx, histNew = findOrCreateColumn(ratObj, gdal.GFU_PixelCount, 
                                            "Histogram", gdal.GFT_Real)
                    # write the hist in a single go
                    ratObj.SetRowCount(histnbins)
                    ratObj.WriteArray(hist, histIndx)

                    # Use SetLinearBinning function if it has been fixed
                    # in the current version of GDAL
                    if haveLinearBinningFix:
                        ratObj.SetLinearBinning(histmin, (histCalcMax - histCalcMin) / histnbins)
                    else:
                        tmpmeta["STATISTICS_HISTOMIN"] = repr(histmin)
                        tmpmeta["STATISTICS_HISTOMAX"] = repr(histmax)
                        tmpmeta["STATISTICS_HISTONUMBINS"] = repr(histnbins)

                    # The HFA driver still honours the STATISTICS_HISTOBINVALUES
                    # metadata item. If we are recalculating the histogram the old
                    # values will be copied across with the metadata so clobber it
                    if "STATISTICS_HISTOBINVALUES" in tmpmeta:
                        del tmpmeta["STATISTICS_HISTOBINVALUES"]
                else:
                    # old method
                    tmpmeta["STATISTICS_HISTOBINVALUES"] = '|'.join(map(repr,hist)) + '|'

                    tmpmeta["STATISTICS_HISTOMIN"] = repr(histmin)
                    tmpmeta["STATISTICS_HISTOMAX"] = repr(histmax)
                    tmpmeta["STATISTICS_HISTONUMBINS"] = repr(histnbins)

                # estimate the median - first bin at which the cumulative 
                # count reaches half of the total
                middlenum = hist.sum() / 2
                gtmiddle = hist.cumsum() >= middlenum
                medianbin = gtmiddle.nonzero()[0][0]
                medianval = medianbin * histstep + histmin
                if isFloat:
                    tmpmeta["STATISTICS_MEDIAN"]  = repr(medianval)
                else:
                    tmpmeta["STATISTICS_MEDIAN"]  = repr(int(round(medianval)))
    
        # set the data
        band.SetMetadata(tmpmeta)

        if haveRFC40 and ratObj is not None and not ratObj.ChangesAreWrittenToFile():
            # For drivers that require the in memory thing
            band.SetDefaultRAT(ratObj)

        percent = percent + percentstep
        progress.setProgress(percent)

        if progress.wasCancelled():
            raise ProcessCancelledError()
    
    progress.setProgress(100)
    
    
def calcStats(ds,progress=None,ignore=None,
        minoverviewdim=DEFAULT_MINOVERVIEWDIM, 
        levels=DEFAULT_OVERVIEWLEVELS,
        aggregationType=None):
    """
    Convenience entry point which runs addStatistics() and then 
    addPyramid() on the dataset. The arguments are passed through 
    to those functions, so see their docstrings for details. 

    When progress is None a cuiprogress.SilentProgress() is used 
    for both calls. 
    
    """
    if progress is not None:
        reporter = progress
    else:
        reporter = cuiprogress.SilentProgress()

    # statistics first, then the pyramid layers
    addStatistics(ds, reporter, ignore)
    addPyramid(ds, reporter, minoverviewdim, levels, aggregationType)