HEX

File: //lib64/python2.7/site-packages/bson/__init__.py
# Copyright 2009-2012 10gen, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""BSON (Binary JSON) encoding and decoding.
"""

import calendar
import datetime
import re
import struct
import sys

from bson.binary import (Binary, OLD_UUID_SUBTYPE,
                         JAVA_LEGACY, CSHARP_LEGACY)
from bson.code import Code
from bson.dbref import DBRef
from bson.errors import (InvalidBSON,
                         InvalidDocument,
                         InvalidStringData)
from bson.max_key import MaxKey
from bson.min_key import MinKey
from bson.objectid import ObjectId
from bson.py3compat import b, binary_type
from bson.son import SON, RE_TYPE
from bson.timestamp import Timestamp
from bson.tz_util import utc


try:
    from bson import _cbson
    _use_c = True
except ImportError:
    _use_c = False

try:
    import uuid
    _use_uuid = True
except ImportError:
    _use_uuid = False

PY3 = sys.version_info[0] == 3


MAX_INT32 = 2147483647
MIN_INT32 = -2147483648
MAX_INT64 = 9223372036854775807
MIN_INT64 = -9223372036854775808

EPOCH_AWARE = datetime.datetime.fromtimestamp(0, utc)
EPOCH_NAIVE = datetime.datetime.utcfromtimestamp(0)

# Create constants compatible with all versions of
# python from 2.4 forward. In 2.x b("foo") is just
# "foo". In 3.x it becomes b"foo".
EMPTY = b("")
ZERO  = b("\x00")
ONE   = b("\x01")

BSONNUM = b("\x01") # Floating point
BSONSTR = b("\x02") # UTF-8 string
BSONOBJ = b("\x03") # Embedded document
BSONARR = b("\x04") # Array
BSONBIN = b("\x05") # Binary
BSONUND = b("\x06") # Undefined
BSONOID = b("\x07") # ObjectId
BSONBOO = b("\x08") # Boolean
BSONDAT = b("\x09") # UTC Datetime
BSONNUL = b("\x0A") # Null
BSONRGX = b("\x0B") # Regex
BSONREF = b("\x0C") # DBRef
BSONCOD = b("\x0D") # Javascript code
BSONSYM = b("\x0E") # Symbol
BSONCWS = b("\x0F") # Javascript code with scope
BSONINT = b("\x10") # 32bit int
BSONTIM = b("\x11") # Timestamp
BSONLON = b("\x12") # 64bit int
BSONMIN = b("\xFF") # Min key
BSONMAX = b("\x7F") # Max key


def _get_int(data, position, as_class=None,
             tz_aware=False, uuid_subtype=OLD_UUID_SUBTYPE, unsigned=False):
    format = unsigned and "I" or "i"
    try:
        value = struct.unpack("<%s" % format, data[position:position + 4])[0]
    except struct.error:
        raise InvalidBSON()
    position += 4
    return value, position


def _get_c_string(data, position, length=None):
    if length is None:
        try:
            end = data.index(ZERO, position)
        except ValueError:
            raise InvalidBSON()
    else:
        end = position + length
    value = data[position:end].decode("utf-8")
    position = end + 1

    return value, position


def _make_c_string(string, check_null=False):
    if isinstance(string, unicode):
        if check_null and "\x00" in string:
            raise InvalidDocument("BSON keys / regex patterns must not "
                                  "contain a NULL character")
        return string.encode("utf-8") + ZERO
    else:
        if check_null and ZERO in string:
            raise InvalidDocument("BSON keys / regex patterns must not "
                                  "contain a NULL character")
        try:
            string.decode("utf-8")
            return string + ZERO
        except UnicodeError:
            raise InvalidStringData("strings in documents must be valid "
                                    "UTF-8: %r" % string)


def _get_number(data, position, as_class, tz_aware, uuid_subtype):
    num = struct.unpack("<d", data[position:position + 8])[0]
    position += 8
    return num, position


def _get_string(data, position, as_class, tz_aware, uuid_subtype):
    length = struct.unpack("<i", data[position:position + 4])[0] - 1
    position += 4
    return _get_c_string(data, position, length)


def _get_object(data, position, as_class, tz_aware, uuid_subtype):
    obj_size = struct.unpack("<i", data[position:position + 4])[0]
    encoded = data[position + 4:position + obj_size - 1]
    object = _elements_to_dict(encoded, as_class, tz_aware, uuid_subtype)
    position += obj_size
    if "$ref" in object:
        return (DBRef(object.pop("$ref"), object.pop("$id", None),
                      object.pop("$db", None), object), position)
    return object, position


def _get_array(data, position, as_class, tz_aware, uuid_subtype):
    obj, position = _get_object(data, position,
                                as_class, tz_aware, uuid_subtype)
    result = []
    i = 0
    while True:
        try:
            result.append(obj[str(i)])
            i += 1
        except KeyError:
            break
    return result, position


def _get_binary(data, position, as_class, tz_aware, uuid_subtype):
    length, position = _get_int(data, position)
    subtype = ord(data[position:position + 1])
    position += 1
    if subtype == 2:
        length2, position = _get_int(data, position)
        if length2 != length - 4:
            raise InvalidBSON("invalid binary (st 2) - lengths don't match!")
        length = length2
    if subtype in (3, 4) and _use_uuid:
        # Java Legacy
        if uuid_subtype == JAVA_LEGACY:
            java = data[position:position + length]
            value = uuid.UUID(bytes=java[0:8][::-1] + java[8:16][::-1])
        # C# legacy
        elif uuid_subtype == CSHARP_LEGACY:
            value = uuid.UUID(bytes_le=data[position:position + length])
        # Python
        else:
            value = uuid.UUID(bytes=data[position:position + length])
        position += length
        return (value, position)
    # Python3 special case. Decode subtype 0 to 'bytes'.
    if PY3 and subtype == 0:
        value = data[position:position + length]
    else:
        value = Binary(data[position:position + length], subtype)
    position += length
    return value, position


def _get_oid(data, position, as_class=None,
             tz_aware=False, uuid_subtype=OLD_UUID_SUBTYPE):
    value = ObjectId(data[position:position + 12])
    position += 12
    return value, position


def _get_boolean(data, position, as_class, tz_aware, uuid_subtype):
    value = data[position:position + 1] == ONE
    position += 1
    return value, position


def _get_date(data, position, as_class, tz_aware, uuid_subtype):
    millis = struct.unpack("<q", data[position:position + 8])[0]
    diff = millis % 1000
    seconds = (millis - diff) / 1000
    position += 8
    if tz_aware:
        dt = EPOCH_AWARE + datetime.timedelta(seconds=seconds)
    else:
        dt = EPOCH_NAIVE + datetime.timedelta(seconds=seconds)
    return dt.replace(microsecond=diff * 1000), position


def _get_code(data, position, as_class, tz_aware, uuid_subtype):
    code, position = _get_string(data, position,
                                 as_class, tz_aware, uuid_subtype)
    return Code(code), position


def _get_code_w_scope(data, position, as_class, tz_aware, uuid_subtype):
    _, position = _get_int(data, position)
    code, position = _get_string(data, position,
                                 as_class, tz_aware, uuid_subtype)
    scope, position = _get_object(data, position,
                                  as_class, tz_aware, uuid_subtype)
    return Code(code, scope), position


def _get_null(data, position, as_class, tz_aware, uuid_subtype):
    return None, position


def _get_regex(data, position, as_class, tz_aware, uuid_subtype):
    pattern, position = _get_c_string(data, position)
    bson_flags, position = _get_c_string(data, position)
    flags = 0
    if "i" in bson_flags:
        flags |= re.IGNORECASE
    if "l" in bson_flags:
        flags |= re.LOCALE
    if "m" in bson_flags:
        flags |= re.MULTILINE
    if "s" in bson_flags:
        flags |= re.DOTALL
    if "u" in bson_flags:
        flags |= re.UNICODE
    if "x" in bson_flags:
        flags |= re.VERBOSE
    return re.compile(pattern, flags), position


def _get_ref(data, position, as_class, tz_aware, uuid_subtype):
    position += 4
    collection, position = _get_c_string(data, position)
    oid, position = _get_oid(data, position)
    return DBRef(collection, oid), position


def _get_timestamp(data, position, as_class, tz_aware, uuid_subtype):
    inc, position = _get_int(data, position, unsigned=True)
    timestamp, position = _get_int(data, position, unsigned=True)
    return Timestamp(timestamp, inc), position


def _get_long(data, position, as_class, tz_aware, uuid_subtype):
    # Have to cast to long; on 32-bit unpack may return an int.
    # 2to3 will change long to int. That's fine since long doesn't
    # exist in python3.
    value = long(struct.unpack("<q", data[position:position + 8])[0])
    position += 8
    return value, position


_element_getter = {
    BSONNUM: _get_number,
    BSONSTR: _get_string,
    BSONOBJ: _get_object,
    BSONARR: _get_array,
    BSONBIN: _get_binary,
    BSONUND: _get_null,  # undefined
    BSONOID: _get_oid,
    BSONBOO: _get_boolean,
    BSONDAT: _get_date,
    BSONNUL: _get_null,
    BSONRGX: _get_regex,
    BSONREF: _get_ref,
    BSONCOD: _get_code,  # code
    BSONSYM: _get_string,  # symbol
    BSONCWS: _get_code_w_scope,
    BSONINT: _get_int,  # number_int
    BSONTIM: _get_timestamp,
    BSONLON: _get_long, # Same as _get_int after 2to3 runs.
    BSONMIN: lambda v, w, x, y, z: (MinKey(), w),
    BSONMAX: lambda v, w, x, y, z: (MaxKey(), w)}


def _element_to_dict(data, position, as_class, tz_aware, uuid_subtype):
    element_type = data[position:position + 1]
    position += 1
    element_name, position = _get_c_string(data, position)
    value, position = _element_getter[element_type](data, position, as_class,
                                                    tz_aware, uuid_subtype)
    return element_name, value, position


def _elements_to_dict(data, as_class, tz_aware, uuid_subtype):
    result = as_class()
    position = 0
    end = len(data) - 1
    while position < end:
        (key, value, position) = _element_to_dict(data, position, as_class,
                                                  tz_aware, uuid_subtype)
        result[key] = value
    return result

def _bson_to_dict(data, as_class, tz_aware, uuid_subtype):
    obj_size = struct.unpack("<i", data[:4])[0]
    length = len(data)
    if length < obj_size:
        raise InvalidBSON("objsize too large")
    if obj_size != length or data[obj_size - 1:obj_size] != ZERO:
        raise InvalidBSON("bad eoo")
    elements = data[4:obj_size - 1]
    return (_elements_to_dict(elements, as_class,
                              tz_aware, uuid_subtype), data[obj_size:])
if _use_c:
    _bson_to_dict = _cbson._bson_to_dict


def _element_to_bson(key, value, check_keys, uuid_subtype):
    if not isinstance(key, basestring):
        raise InvalidDocument("documents must have only string keys, "
                              "key was %r" % key)

    if check_keys:
        if key.startswith("$"):
            raise InvalidDocument("key %r must not start with '$'" % key)
        if "." in key:
            raise InvalidDocument("key %r must not contain '.'" % key)

    name = _make_c_string(key, True)
    if isinstance(value, float):
        return BSONNUM + name + struct.pack("<d", value)

    if _use_uuid:
        if isinstance(value, uuid.UUID):
            # Java Legacy
            if uuid_subtype == JAVA_LEGACY:
                # Python 3.0(.1) returns a bytearray instance for bytes (3.1
                # and newer just return a bytes instance). Convert that to
                # binary_type (here and below) for compatibility.
                from_uuid = binary_type(value.bytes)
                as_legacy_java = from_uuid[0:8][::-1] + from_uuid[8:16][::-1]
                value = Binary(as_legacy_java, subtype=OLD_UUID_SUBTYPE)
            # C# legacy
            elif uuid_subtype == CSHARP_LEGACY:
                # Microsoft GUID representation.
                value = Binary(binary_type(value.bytes_le),
                               subtype=OLD_UUID_SUBTYPE)
            # Python
            else:
                value = Binary(binary_type(value.bytes), subtype=uuid_subtype)

    if isinstance(value, Binary):
        subtype = value.subtype
        if subtype == 2:
            value = struct.pack("<i", len(value)) + value
        return (BSONBIN + name +
                struct.pack("<i", len(value)) + b(chr(subtype)) + value)
    if isinstance(value, Code):
        cstring = _make_c_string(value)
        if not value.scope:
            length = struct.pack("<i", len(cstring))
            return BSONCOD + name + length + cstring
        scope = _dict_to_bson(value.scope, False, uuid_subtype, False)
        full_length = struct.pack("<i", 8 + len(cstring) + len(scope))
        length = struct.pack("<i", len(cstring))
        return BSONCWS + name + full_length + length + cstring + scope
    if isinstance(value, binary_type):
        if PY3:
            # Python3 special case. Store 'bytes' as BSON binary subtype 0.
            return (BSONBIN + name +
                    struct.pack("<i", len(value)) + ZERO + value)
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return BSONSTR + name + length + cstring
    if isinstance(value, unicode):
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return BSONSTR + name + length + cstring
    if isinstance(value, dict):
        return BSONOBJ + name + _dict_to_bson(value, check_keys, uuid_subtype, False)
    if isinstance(value, (list, tuple)):
        as_dict = SON(zip([str(i) for i in range(len(value))], value))
        return BSONARR + name + _dict_to_bson(as_dict, check_keys, uuid_subtype, False)
    if isinstance(value, ObjectId):
        return BSONOID + name + value.binary
    if value is True:
        return BSONBOO + name + ONE
    if value is False:
        return BSONBOO + name + ZERO
    if isinstance(value, int):
        # TODO this is an ugly way to check for this...
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        if value > MAX_INT32 or value < MIN_INT32:
            return BSONLON + name + struct.pack("<q", value)
        return BSONINT + name + struct.pack("<i", value)
    # 2to3 will convert long to int here since there is no long in python3.
    # That's OK. The previous if block will match instead.
    if isinstance(value, long):
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        return BSONLON + name + struct.pack("<q", value)
    if isinstance(value, datetime.datetime):
        if value.utcoffset() is not None:
            value = value - value.utcoffset()
        millis = int(calendar.timegm(value.timetuple()) * 1000 +
                     value.microsecond / 1000)
        return BSONDAT + name + struct.pack("<q", millis)
    if isinstance(value, Timestamp):
        time = struct.pack("<I", value.time)
        inc = struct.pack("<I", value.inc)
        return BSONTIM + name + inc + time
    if value is None:
        return BSONNUL + name
    if isinstance(value, RE_TYPE):
        pattern = value.pattern
        flags = ""
        if value.flags & re.IGNORECASE:
            flags += "i"
        if value.flags & re.LOCALE:
            flags += "l"
        if value.flags & re.MULTILINE:
            flags += "m"
        if value.flags & re.DOTALL:
            flags += "s"
        if value.flags & re.UNICODE:
            flags += "u"
        if value.flags & re.VERBOSE:
            flags += "x"
        return BSONRGX + name + _make_c_string(pattern, True) + \
            _make_c_string(flags)
    if isinstance(value, DBRef):
        return _element_to_bson(key, value.as_doc(), False, uuid_subtype)
    if isinstance(value, MinKey):
        return BSONMIN + name
    if isinstance(value, MaxKey):
        return BSONMAX + name

    raise InvalidDocument("cannot convert value of type %s to bson" %
                          type(value))


def _dict_to_bson(dict, check_keys, uuid_subtype, top_level=True):
    try:
        elements = []
        if top_level and "_id" in dict:
            elements.append(_element_to_bson("_id", dict["_id"], False, uuid_subtype))
        for (key, value) in dict.iteritems():
            if not top_level or key != "_id":
                elements.append(_element_to_bson(key, value, check_keys, uuid_subtype))
    except AttributeError:
        raise TypeError("encoder expected a mapping type but got: %r" % dict)

    encoded = EMPTY.join(elements)
    length = len(encoded) + 5
    return struct.pack("<i", length) + encoded + ZERO
if _use_c:
    _dict_to_bson = _cbson._dict_to_bson



def decode_all(data, as_class=dict,
               tz_aware=True, uuid_subtype=OLD_UUID_SUBTYPE):
    """Decode BSON data to multiple documents.

    `data` must be a string of concatenated, valid, BSON-encoded
    documents.

    :Parameters:
      - `data`: BSON data
      - `as_class` (optional): the class to use for the resulting
        documents
      - `tz_aware` (optional): if ``True``, return timezone-aware
        :class:`~datetime.datetime` instances

    .. versionadded:: 1.9
    """
    docs = []
    position = 0
    end = len(data) - 1
    while position < end:
        obj_size = struct.unpack("<i", data[position:position + 4])[0]
        if len(data) - position < obj_size:
            raise InvalidBSON("objsize too large")
        if data[position + obj_size - 1:position + obj_size] != ZERO:
            raise InvalidBSON("bad eoo")
        elements = data[position + 4:position + obj_size - 1]
        position += obj_size
        docs.append(_elements_to_dict(elements, as_class,
                                      tz_aware, uuid_subtype))
    return docs
if _use_c:
    decode_all = _cbson.decode_all


def is_valid(bson):
    """Check that the given string represents valid :class:`BSON` data.

    Raises :class:`TypeError` if `bson` is not an instance of
    :class:`str` (:class:`bytes` in python 3). Returns ``True``
    if `bson` is valid :class:`BSON`, ``False`` otherwise.

    :Parameters:
      - `bson`: the data to be validated
    """
    if not isinstance(bson, binary_type):
        raise TypeError("BSON data must be an instance "
                        "of a subclass of %s" % (binary_type.__name__,))

    try:
        (_, remainder) = _bson_to_dict(bson, dict, True, OLD_UUID_SUBTYPE)
        return remainder == EMPTY
    except:
        return False


class BSON(binary_type):
    """BSON (Binary JSON) data.
    """

    @classmethod
    def encode(cls, document, check_keys=False, uuid_subtype=OLD_UUID_SUBTYPE):
        """Encode a document to a new :class:`BSON` instance.

        A document can be any mapping type (like :class:`dict`).

        Raises :class:`TypeError` if `document` is not a mapping type,
        or contains keys that are not instances of
        :class:`basestring` (:class:`str` in python 3). Raises
        :class:`~bson.errors.InvalidDocument` if `document` cannot be
        converted to :class:`BSON`.

        :Parameters:
          - `document`: mapping type representing a document
          - `check_keys` (optional): check if keys start with '$' or
            contain '.', raising :class:`~bson.errors.InvalidDocument` in
            either case

        .. versionadded:: 1.9
        """
        return cls(_dict_to_bson(document, check_keys, uuid_subtype))

    def decode(self, as_class=dict,
               tz_aware=False, uuid_subtype=OLD_UUID_SUBTYPE):
        """Decode this BSON data.

        The default type to use for the resultant document is
        :class:`dict`. Any other class that supports
        :meth:`__setitem__` can be used instead by passing it as the
        `as_class` parameter.

        If `tz_aware` is ``True`` (recommended), any
        :class:`~datetime.datetime` instances returned will be
        timezone-aware, with their timezone set to
        :attr:`bson.tz_util.utc`. Otherwise (default), all
        :class:`~datetime.datetime` instances will be naive (but
        contain UTC).

        :Parameters:
          - `as_class` (optional): the class to use for the resulting
            document
          - `tz_aware` (optional): if ``True``, return timezone-aware
            :class:`~datetime.datetime` instances

        .. versionadded:: 1.9
        """
        (document, _) = _bson_to_dict(self, as_class, tz_aware, uuid_subtype)
        return document


def has_c():
    """Is the C extension installed?

    .. versionadded:: 1.9
    """
    return _use_c


def has_uuid():
    """Is the uuid module available?

    .. versionadded:: 2.3
    """
    return _use_uuid