Merge branch 'python/unicode_decode_errors' into 'master'

Python unicode decode errors when decrypting. See merge request matrix-org/olm!4
2019-06-22 17:06:02 +00:00 · 2019-06-22 17:06:02 +00:00 · ae38f2c5a0
commit ae38f2c5a0
parent 25662564d4 61175c969b
10 changed files with 98 additions and 19 deletions
--- a/python/Makefile
+++ b/python/Makefile
@ -43,6 +43,9 @@ test: olm-python2 olm-python3
 	PYTHONPATH=install-temp/3 python3 -m pytest --cov --cov-branch --benchmark-disable
 	rm -rf install-temp
 # Run isort over the Python sources, passing `olm` via -p so it is treated as
 # this project's own package. NOTE(review): -y presumably applies the changes
 # without prompting -- confirm against the isort version in use.
 isort:
 	isort -y -p olm
 clean:
 	rm -rf python_olm.egg-info/ dist/ __pycache__/
 	rm -rf *.so _libolm.o
--- a/python/olm/_compat.py
+++ b/python/olm/_compat.py
@ -44,3 +44,24 @@ def to_bytes(string):
        return bytes(string, "utf-8")
    raise TypeError("Invalid type {}".format(type(string)))
 def to_unicode_str(byte_string, errors="replace"):
    """Decode a UTF-8 byte string into a unicode string.

    Meant for byte strings that come from an untrusted source and may
    therefore contain sequences that are not valid UTF-8.

    Args:
        byte_string (bytes): The byte string to decode.
        errors (str, optional): Name of the error handling scheme applied
            to undecodable input. "strict" raises a UnicodeDecodeError,
            "ignore" drops the offending bytes and "replace" (the default)
            substitutes U+FFFD for them. Any other handler registered with
            codecs.register_error that can handle a UnicodeDecodeError is
            accepted as well.

    Returns the decoded unicode string.
    """
    decoded = byte_string.decode("utf-8", errors)
    return decoded
--- a/python/olm/group_session.py
+++ b/python/olm/group_session.py
@ -33,7 +33,7 @@ from future.utils import bytes_to_native_str
 # pylint: disable=no-name-in-module
 from _libolm import ffi, lib  # type: ignore
-from ._compat import URANDOM, to_bytearray, to_bytes
+from ._compat import URANDOM, to_bytearray, to_bytes, to_unicode_str
 from ._finalize import track_for_finalization
@ -176,8 +176,8 @@ class InboundGroupSession(object):
        raise OlmGroupSessionError(last_error)
-    def decrypt(self, ciphertext):
+    def decrypt(self, ciphertext, unicode_errors="replace"):
-        # type: (AnyStr) -> Tuple[str, int]
+        # type: (AnyStr, str) -> Tuple[str, int]
        """Decrypt a message
        Returns a tuple of the decrypted plain-text and the message index of
@ -197,6 +197,13 @@ class InboundGroupSession(object):
        Args:
            ciphertext(str): Base64 encoded ciphertext containing the encrypted
                message
            unicode_errors(str, optional): The error handling scheme to use for
                unicode decoding errors. The default is "replace" meaning that
                each byte sequence that could not be decoded will be replaced
                with the unicode replacement character (U+FFFD). Other possible
                values are "strict" and "ignore" as well as any other name
                registered with codecs.register_error that can handle
                UnicodeDecodeErrors.
        """
        if not ciphertext:
            raise ValueError("Ciphertext can't be empty.")
@ -223,10 +230,10 @@ class InboundGroupSession(object):
        self._check_error(plaintext_length)
-        plaintext = bytes_to_native_str(ffi.unpack(
+        plaintext = to_unicode_str(
-            plaintext_buffer,
+            ffi.unpack(plaintext_buffer, plaintext_length),
-            plaintext_length
+            errors=unicode_errors
-        ))
+        )
        # clear out copies of the plaintext
        lib.memset(plaintext_buffer, 0, max_plaintext_length)
--- a/python/olm/pk.py
+++ b/python/olm/pk.py
@ -40,7 +40,7 @@ from future.utils import bytes_to_native_str
 from _libolm import ffi, lib  # type: ignore
-from ._compat import URANDOM, to_bytearray
+from ._compat import URANDOM, to_bytearray, to_unicode_str
 from ._finalize import track_for_finalization
@ -313,8 +313,8 @@ class PkDecryption(object):
        return obj
-    def decrypt(self, message):
+    def decrypt(self, message, unicode_errors="replace"):
-        # type (PkMessage) -> str
+        # type: (PkMessage, str) -> str
        """Decrypt a previously encrypted Pk message.
        Returns the decrypted plaintext.
@ -322,6 +322,13 @@ class PkDecryption(object):
        Args:
            message(PkMessage): the pk message to decrypt.
            unicode_errors(str, optional): The error handling scheme to use for
                unicode decoding errors. The default is "replace" meaning that
                each byte sequence that could not be decoded will be replaced
                with the unicode replacement character (U+FFFD). Other possible
                values are "strict" and "ignore" as well as any other name
                registered with codecs.register_error that can handle
                UnicodeDecodeErrors.
        """
        ephemeral_key = to_bytearray(message.ephemeral_key)
        ephemeral_key_size = len(ephemeral_key)
@ -354,7 +361,7 @@ class PkDecryption(object):
        # clear out copies of the plaintext
        lib.memset(plaintext_buffer, 0, max_plaintext_length)
-        return bytes_to_native_str(plaintext)
+        return to_unicode_str(plaintext, errors=unicode_errors)
 def _clear_pk_signing(pk_struct):
--- a/python/olm/sas.py
+++ b/python/olm/sas.py
@ -30,15 +30,15 @@ Examples:
 """
 from functools import wraps
 from builtins import bytes
 from functools import wraps
 from typing import Optional
 from future.utils import bytes_to_native_str
 from _libolm import ffi, lib
-from ._compat import URANDOM, to_bytes, to_bytearray
+from ._compat import URANDOM, to_bytearray, to_bytes
 from ._finalize import track_for_finalization
--- a/python/olm/session.py
+++ b/python/olm/session.py
@ -40,7 +40,7 @@ from future.utils import bytes_to_native_str
 # pylint: disable=no-name-in-module
 from _libolm import ffi, lib  # type: ignore
-from ._compat import URANDOM, to_bytearray, to_bytes
+from ._compat import URANDOM, to_bytearray, to_bytes, to_unicode_str
 from ._finalize import track_for_finalization
 # This is imported only for type checking purposes
@ -273,8 +273,8 @@ class Session(object):
        else:  # pragma: no cover
            raise ValueError("Unknown message type")
-    def decrypt(self, message):
+    def decrypt(self, message, unicode_errors="replace"):
-        # type: (_OlmMessage) -> str
+        # type: (_OlmMessage, str) -> str
        """Decrypts a message using the session. Returns the plaintext string
        on success. Raises OlmSessionError on failure. If the base64 couldn't
        be decoded then the error message will be "INVALID_BASE64". If the
@ -285,7 +285,14 @@ class Session(object):
        Args:
            message(OlmMessage): The Olm message that will be decrypted. It can
-            be either an OlmPreKeyMessage or an OlmMessage.
+                be either an OlmPreKeyMessage or an OlmMessage.
            unicode_errors(str, optional): The error handling scheme to use for
                unicode decoding errors. The default is "replace" meaning that
                each byte sequence that could not be decoded will be replaced
                with the unicode replacement character (U+FFFD). Other possible
                values are "strict" and "ignore" as well as any other name
                registered with codecs.register_error that can handle
                UnicodeDecodeErrors.
        """
        if not message.ciphertext:
            raise ValueError("Ciphertext can't be empty")
@ -311,8 +318,10 @@ class Session(object):
            plaintext_buffer, max_plaintext_length
        )
        self._check_error(plaintext_length)
-        plaintext = bytes_to_native_str(
+        plaintext = to_unicode_str(
-            ffi.unpack(plaintext_buffer, plaintext_length))
+            ffi.unpack(plaintext_buffer, plaintext_length),
            errors=unicode_errors
        )
        # clear out copies of the plaintext
        lib.memset(plaintext_buffer, 0, max_plaintext_length)
--- a/python/olm/utility.py
+++ b/python/olm/utility.py
@ -32,6 +32,7 @@ Examples:
 # pylint: disable=redefined-builtin,unused-import
 from typing import AnyStr, Type
 from future.utils import bytes_to_native_str
 # pylint: disable=no-name-in-module
--- a/python/tests/group_session_test.py
+++ b/python/tests/group_session_test.py
@ -1,3 +1,4 @@
 # -*- coding: utf-8 -*-
 import pytest
 from olm import InboundGroupSession, OlmGroupSessionError, OutboundGroupSession
@ -112,3 +113,16 @@ class TestClass(object):
        outbound = OutboundGroupSession()
        inbound = InboundGroupSession(outbound.session_key)
        del inbound
    def test_invalid_unicode_decrypt(self):
        """Check that undecodable plaintext follows the requested error scheme."""
        outbound = OutboundGroupSession()
        inbound = InboundGroupSession(outbound.session_key)
        # A lone 0xED byte is not valid UTF-8.
        text = outbound.encrypt(b"\xed")

        # Default scheme ("replace"): the bad byte becomes U+FFFD. The escape
        # is spelled out so the expectation survives any file re-encoding.
        plaintext, _ = inbound.decrypt(text)
        assert plaintext == u"\ufffd"

        # "ignore" drops the bad byte, leaving an empty string.
        plaintext, _ = inbound.decrypt(text, "ignore")
        assert plaintext == ""
--- a/python/tests/pk_test.py
+++ b/python/tests/pk_test.py
@ -1,3 +1,4 @@
 # -*- coding: utf-8 -*-
 import pytest
 from olm import (PkDecryption, PkDecryptionError, PkEncryption, PkSigning,
@ -55,3 +56,10 @@ class TestClass(object):
        message = "This statement is true"
        signature = signing.sign(message)
        ed25519_verify(signing.public_key, message, signature)
    def test_invalid_unicode_decrypt(self):
        """Check that undecodable plaintext is replaced with U+FFFD by default."""
        decryption = PkDecryption()
        encryption = PkEncryption(decryption.public_key)
        # A lone 0xED byte is not valid UTF-8.
        message = encryption.encrypt(b"\xed")
        plaintext = decryption.decrypt(message)
        # The escape is spelled out so the expectation survives any file
        # re-encoding.
        assert plaintext == u"\ufffd"
--- a/python/tests/session_test.py
+++ b/python/tests/session_test.py
@ -1,3 +1,4 @@
 # -*- coding: utf-8 -*-
 import pytest
 from olm import (Account, InboundSession, OlmMessage, OlmPreKeyMessage,
@ -141,3 +142,11 @@ class TestClass(object):
        new_message = new_session.encrypt(plaintext)
        assert bob_session.matches(new_message) is False
    def test_invalid_unicode_decrypt(self):
        """Check that undecodable plaintext is replaced with U+FFFD by default."""
        alice, bob, session = self._create_session()
        # A lone 0xED byte is not valid UTF-8.
        message = session.encrypt(b"\xed")

        bob_session = InboundSession(bob, message)
        plaintext = bob_session.decrypt(message)
        # The escape is spelled out so the expectation survives any file
        # re-encoding.
        assert plaintext == u"\ufffd"