# Source code for datasources.connectors.hypercat

"""
This module contains data connector classes for retrieving data from HyperCat catalogues.
"""

import typing

from .base import BaseDataConnector, DataCatalogueConnector, DataSetConnector, DatasetNotFoundError


class HyperCat(DataCatalogueConnector):
    """
    Data connector for retrieving data or metadata from a HyperCat catalogue.

    May be used as a context manager: on entry the catalogue is fetched once and
    the response is cached on the instance; requests made without query
    parameters inside the ``with`` block reuse that cached response.  The cache
    is discarded again on exit.
    """
    # Connector class used to wrap catalogue entries that are not themselves catalogues
    dataset_connector_class = DataSetConnector

    def __init__(self, location: str,
                 api_key: typing.Optional[str] = None,
                 auth: typing.Optional[typing.Callable] = None):
        """
        :param location: Location of the catalogue, passed through to the parent connector
        :param api_key: Optional API key, passed through to the parent connector
        :param auth: Optional authentication callable, passed through to the parent connector
        """
        super().__init__(location, api_key=api_key, auth=auth)

        # Response cached by __enter__ - None when no cached response is held
        self._response = None

    def __getitem__(self, item: str) -> BaseDataConnector:
        """
        Get a connector for the catalogue entry whose 'href' is ``item``.

        :param item: 'href' of the catalogue entry to look up
        :return: Another instance of this class if the entry declares the HyperCat
                 catalogue content type, otherwise a ``dataset_connector_class`` instance
        :raises DatasetNotFoundError: If no matching entry or entry metadata is found
        """
        params = {
            'href': item
        }

        response = self._get_response(params)

        try:
            dataset_item = self._get_item_by_key_value(
                response['items'],
                'href',
                item
            )
            metadata = dataset_item['item-metadata']

        except KeyError as e:
            raise DatasetNotFoundError(
                'Dataset {0} could not be found'.format(item)
            ) from e

        try:
            try:
                content_type = self._get_item_by_key_value(
                    metadata,
                    'rel',
                    'urn:X-hypercat:rels:isContentType'
                )['val']

            except KeyError:
                # No 'isContentType' relation - fall back to 'containsContentType'
                content_type = self._get_item_by_key_value(
                    metadata,
                    'rel',
                    'urn:X-hypercat:rels:containsContentType'
                )['val']

            if content_type == 'application/vnd.hypercat.catalogue+json':
                # Entry is itself a catalogue - wrap it in the same connector type
                return type(self)(location=item,
                                  api_key=self.api_key,
                                  auth=self.auth)

        except (KeyError, ValueError):
            # Has no or multiple values for content type - is not a catalogue
            pass

        return self.dataset_connector_class(item, self.api_key,
                                            auth=self.auth,
                                            metadata=metadata)

    def items(self,
              params: typing.Optional[typing.Mapping[str, str]] = None) -> typing.ItemsView:
        """
        Get key-value pairs of dataset ID to dataset connector for datasets contained within this catalogue.

        :param params: Query parameters to be passed through to the data source API
        :return: Dictionary ItemsView over datasets
        """
        response = self._get_response(params)

        return {
            item['href']: self.dataset_connector_class(item['href'], self.api_key,
                                                       auth=self.auth,
                                                       metadata=item['item-metadata'])
            for item in response['items']
        }.items()

    # TODO this gets the entire HyperCat contents so is slow on the BT HyperCat API - ~1s
    def get_metadata(self,
                     params: typing.Optional[typing.Mapping[str, str]] = None):
        """
        Get the metadata of this catalogue.

        :param params: Query parameters to be passed through to the data source API
        :return: The 'catalogue-metadata' entry of the catalogue response
        """
        response = self._get_response(params)

        return response['catalogue-metadata']

    def get_datasets(self,
                     params: typing.Optional[typing.Mapping[str, str]] = None) -> typing.List[str]:
        """
        Get the 'href' of every entry contained within this catalogue.

        :param params: Query parameters to be passed through to the data source API
        :return: List of entry 'href' values, in the order given by the response
        """
        response = self._get_response(params=params)

        return [item['href'] for item in response['items']]

    @staticmethod
    def _get_item_by_key_value(collection: typing.Iterable[typing.Mapping],
                               key: str, value: typing.Any) -> typing.Mapping:
        """
        Find the single mapping in ``collection`` for which ``item[key] == value``.

        Entries which do not contain ``key`` at all are skipped, so that one
        malformed entry does not hide a valid match elsewhere in the collection.

        :param collection: Mappings to search
        :param key: Key to compare on
        :param value: Value which ``item[key]`` must equal
        :return: The single matching mapping
        :raises KeyError: If no entry matches
        :raises ValueError: If more than one entry matches
        """
        matches = [item for item in collection
                   if key in item and item[key] == value]

        if not matches:
            raise KeyError('No item found with {0} == {1!r}'.format(key, value))

        if len(matches) > 1:
            raise ValueError('Multiple items were found')

        return matches[0]

    def _get_response(self, params: typing.Optional[typing.Mapping[str, str]] = None) -> typing.Mapping:
        """
        Get the decoded JSON body of the catalogue response.

        :param params: Query parameters to be passed through to the data source API
        :return: Result of calling ``.json()`` on the response
        """
        # Use cached response if we have one
        # TODO should we use cached responses?
        if self._response is not None and params is None:
            # The cached response was fetched without params, so it is only
            # reused for requests which also have no params
            response = self._response
        else:
            response = self._get_auth_request(self.location,
                                              params=params)

        # Raise on an error response - presumably a requests-style response object
        response.raise_for_status()
        return response.json()

    def __enter__(self):
        """
        Fetch the catalogue once and cache the response for use within the ``with`` block.
        """
        self._response = self._get_auth_request(self.location)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Discard the response cached by ``__enter__`` so that it is not served
        stale once the ``with`` block has ended.  Exceptions are not suppressed.
        """
        self._response = None