Python marshmallow 库

前言

使用 Python 时，经常需要将一些数据序列化后存储到本地，
同时又希望能将本地的 json 数据反序列化为对象。
通常的解决方案是使用数据库的 orm 方案，用 orm 对象来同步数据库。
数据全部附着在 orm 上，当 orm 上的数据改变时直接修改到数据库上。

但是在我的工作使用场景中，更推崇 Data Centric 的流程，因此输出一个 json 文件会更好一点。
那么 marshmallow 库就是一个很不错的选项。

另外，这个库可以和之前提到的 attrs 库结合使用。文章

Github 地址
 官方说明文档

什么是序列化什么是 orm

序列化就是将代码对象转换为纯数据进行存储
反序列化就是将纯数据重新转换为代码对象
代码对象可以拥有特定的方法，可以直接触发对数据的处理。

orm 全称是 Object-Relational Mapping（对象关系映射）
通常是一个定义了对象实例化规则的类。
通过操作这个类的实例就可以用代码的方式将数据进行互相转换。

上面的图片就是传统 orm 实现的效果，可以用 orm 对象来执行 sql 语句从而简化数据库同步的操作，同时也增加了代码的安全性。
这个操作实现了内存到硬盘的桥梁，管理更加清晰方便。

marshmallow 介绍

marshmallow 基本用法

和其他 orm 库一样，marshmallow 需要定义 Schema 类作为数据约束。

import attr
@attr.s
class Album:
    """Plain data object describing an album; attrs generates ``__init__``."""

    title = attr.ib()  # album title
    artist = attr.ib()  # the artist of the album (an Artist in this example)
    
@attr.s
class Artist:
    """Plain data object describing an artist; attrs generates ``__init__``."""

    name = attr.ib()  # display name of the artist

# NOTE: build the plain Python objects that will be serialized
bowie = Artist(name="David Bowie")
album = Album(title="Hunky Dory", artist=bowie)

from marshmallow import Schema, fields

# NOTE: Schemas declare how each attribute is converted
class ArtistSchema(Schema):
    """Conversion rules for an artist: a single string field."""

    name = fields.Str()

class AlbumSchema(Schema):
    """Conversion rules for an album, including its nested artist."""

    title = fields.Str()
    # The artist is an object of its own, so it is delegated to ArtistSchema.
    artist = fields.Nested(ArtistSchema())

# NOTE: dump() converts the object into a plain dict
schema = AlbumSchema()
as_dict = schema.dump(album)

print(type(as_dict))  # <class 'dict'>
print(as_dict)  # {'artist': {'name': 'David Bowie'}, 'title': 'Hunky Dory'}

# dumps() converts the object into a JSON string instead
as_json = schema.dumps(album)
print(type(as_json))  # <class 'str'>
print(as_json)   # '{"artist": {"name": "David Bowie"}, "title": "Hunky Dory"}'

# loads() parses the JSON string back; with no post_load hook the result is a dict
loaded = schema.loads(as_json)
print(type(loaded))  # <class 'dict'>
print(loaded)  # {'artist': {'name': 'David Bowie'}, 'title': 'Hunky Dory'}

通过 Schema 定义好数据对象的转换方式。
dump 可以将对象数据转换为字典，dumps 则是转换为字符串
load 可以将字典转换为对象（默认得到的仍是字典，需要借助 post_load 等额外处理才能得到对象），loads 则可以将字符串转换为对象。

反序列化为对象

import attr
from marshmallow import Schema, fields, post_load


@attr.s
class Album:
    """Plain data object describing an album; attrs generates ``__init__``."""

    title = attr.ib()  # album title
    artist = attr.ib()  # the artist of the album (an Artist in this example)


@attr.s
class Artist:
    """Plain data object describing an artist; attrs generates ``__init__``."""

    name = attr.ib()  # display name of the artist


class ArtistSchema(Schema):
    """Schema for an artist that rebuilds an ``Artist`` instance on load."""

    name = fields.Str()

    @post_load
    def make_artist(self, data, **kwargs):
        """Turn the deserialized dict into an ``Artist``.

        Registered with ``post_load``, so it receives the already
        deserialized ``data`` dict; its keys are passed to ``Artist`` as
        keyword arguments.
        """
        return Artist(**data)


class AlbumSchema(Schema):
    """Schema for an album that rebuilds an ``Album`` instance on load."""

    title = fields.Str()
    # The nested schema has its own post_load hook, so by the time
    # make_album runs, data["artist"] is whatever that hook returned.
    artist = fields.Nested(ArtistSchema())

    @post_load
    def make_album(self, data, **kwargs):
        """Turn the deserialized dict into an ``Album``."""
        return Album(**data)


bowie = Artist(name="David Bowie")
album = Album(title="Hunky Dory", artist=bowie)

# NOTE: round-trip through a JSON string; the post_load hooks rebuild objects
schema = AlbumSchema()
payload = schema.dumps(album)
album = schema.loads(payload)
print(album)  # Album(title='Hunky Dory', artist=Artist(name='David Bowie'))
print(album.title)  # Hunky Dory
print(album.artist)  # Artist(name='David Bowie')
print(album.artist.name)  # David Bowie

通过加入 post_load 装饰器可以将字典数据做进一步的转换。
使用 attrs 库就不需要在 __init__ 函数中写入大量传参和初始化数据的信息了。

嵌套 Schema

官方文档

通过 fields.Nested 的方法定义嵌套的对象，从而序列化和反序列化可以复用 Schema。

import attr
from addict import Dict
from marshmallow import Schema, fields


@attr.s
class Book(Dict):
    """A book record; attrs generates ``__init__`` on top of ``Dict``."""

    title = attr.ib()  # book title
    author = attr.ib()  # author of the book (later rebound to an Author)


@attr.s
class Author(Dict):
    """An author record; attrs generates ``__init__`` on top of ``Dict``."""

    name = attr.ib()  # author name
    books = attr.ib()  # list of the author's books


# The book starts with a placeholder author string, then is pointed at the
# real Author so the two objects reference each other.
potter = Book(title="potter", author="JK")
JK = Author(name="JK", books=[potter])
potter.author = JK

class BookSchema(Schema):
    """Schema for a book; the nested author is reduced to its name."""

    title = fields.Str()
    # The schema is referenced by name because AuthorSchema is defined later;
    # only=("name",) keeps the author's book list out of the output.
    author = fields.Nested("AuthorSchema", only=("name",))


class AuthorSchema(Schema):
    """Schema for an author; nested books omit their author field."""

    name = fields.Str()
    # exclude=("author",) stops each nested book from pointing back here.
    books = fields.List(fields.Nested("BookSchema", exclude=("author",)))


# Dump the book; the nested author is limited to its name by the schema.
book_schema = BookSchema()
print(book_schema.dump(potter))  # {'title': 'potter', 'author': {'name': 'JK'}}

自定义 Field

官方文档

默认提供的 field 可能不能满足需求。
有些库的 field 需要自定义复杂的序列化和反序列化操作。
这个时候就可以定义自己的 field 来解决问题。

简单的情况可以使用 Method 和 Function 来解决问题

class UserSchema(Schema):
    """User schema with a computed ``since_created`` field."""

    name = fields.String()
    email = fields.String()
    created_at = fields.DateTime()
    # fields.Method names the schema method that produces the value.
    since_created = fields.Method("get_days_since_created")

    def get_days_since_created(self, obj):
        """Return the number of whole days elapsed since ``obj.created_at``.

        Assumes ``obj.created_at`` is a ``datetime`` - confirm against the
        objects being dumped.
        """
        import datetime as dt

        # Subtract the full datetimes: comparing only the day-of-month
        # breaks across month boundaries (Jan 31 -> Feb 1 would give -30).
        # Reusing created_at's tzinfo keeps naive/aware values compatible.
        now = dt.datetime.now(tz=obj.created_at.tzinfo)
        return (now - obj.created_at).days

class UserSchema(Schema):
    """User schema with an ``uppername`` field computed by a function."""

    name = fields.Str()
    email = fields.Str()
    created_at = fields.DateTime()
    # The callable receives the object being dumped and returns the value.
    uppername = fields.Function(serialize=lambda obj: obj.name.upper())

默认情况下是 serialize 函数，如果要自定义 deserialize 可以使用 Method 和 Function 传入 deserialize 参数进行指定。

复杂的情况就需要 fields.Field 类。

from marshmallow import fields, ValidationError


class PinCode(fields.Field):
    """Field that serializes to a string of numbers and deserializes
    to a list of numbers.
    """

    def _serialize(self, value, attr, obj, **kwargs):
        """Join the items of ``value`` into one string; ``None`` gives ``""``."""
        if value is None:
            return ""
        return "".join(str(d) for d in value)

    def _deserialize(self, value, attr, data, **kwargs):
        """Convert each character of ``value`` to an int.

        Raises:
            ValidationError: if ``value`` contains a non-digit character or
                is not iterable at all (for example an int or ``None``).
        """
        try:
            return [int(c) for c in value]
        except (TypeError, ValueError) as error:
            # TypeError covers non-iterable input, which would otherwise
            # escape as a raw exception instead of a validation failure.
            raise ValidationError("Pin codes must contain only digits.") from error


class UserSchema(Schema):
    """User schema that uses the custom ``PinCode`` field."""

    name = fields.Str()
    email = fields.Str()
    created_at = fields.DateTime()
    pin_code = PinCode()  # custom field, used like any built-in one

踩过的坑

双向嵌套数据

如果数据存在相互嵌套引用的关系，是无法通过原生的 json 内置库进行序列化的。

import attr
import json
from addict import Dict
from marshmallow import Schema, fields


@attr.s
class Book(Dict):
    """A book record; attrs generates ``__init__`` on top of ``Dict``."""

    title = attr.ib()  # book title
    author = attr.ib()  # author of the book (later rebound to an Author)


@attr.s
class Author(Dict):
    """An author record; attrs generates ``__init__`` on top of ``Dict``."""

    name = attr.ib()  # author name
    books = attr.ib()  # list of the author's books


# Point the two objects at each other so they form a reference cycle.
potter = Book(title="potter", author="JK")
JK = Author(name="JK", books=[potter])
potter.author = JK
# The built-in json encoder rejects the cycle, as the traceback below shows.
print(json.dumps(potter))
# Traceback (most recent call last):
#   File "f:/repo/_blog/source/_posts/Python/pacakge/02_marshmallow.py", line 22, in <module>
#     print(json.dumps(potter))
#   File "C:\tools\Anaconda3\lib\json\__init__.py", line 231, in dumps
#     return _default_encoder.encode(obj)
#   File "C:\tools\Anaconda3\lib\json\encoder.py", line 199, in encode
#     chunks = self.iterencode(o, _one_shot=True)
#   File "C:\tools\Anaconda3\lib\json\encoder.py", line 257, in iterencode
#     return _iterencode(o, 0)
# ValueError: Circular reference detected

marshmallow 则需要通过 Schema 的定义过滤掉特定的嵌套键值才可用。
并且加载数据的时候并不能还原它们原有的关联关系。
需要自己手动去定义反序列化之后的操作。

import attr
import json
from addict import Dict
from marshmallow import Schema, fields,post_load


@attr.s
class Book(Dict):
    """A book record whose author is optional at construction time."""

    title = attr.ib()  # book title
    # Defaults to "" so a book can be built from data that has no author key.
    author = attr.ib(default="")


@attr.s
class Author(Dict):
    """An author record whose book list is optional at construction time."""

    name = attr.ib()  # author name
    # factory=list gives every instance its own fresh, empty list.
    books = attr.ib(factory=list)


# Point the two objects at each other so they form a reference cycle.
potter = Book(title="potter", author="JK")
JK = Author(name="JK", books=[potter])
potter.author = JK

class BookSchema(Schema):
    """Schema for a book that restores the book/author link on load."""

    title = fields.Str()
    author = fields.Nested("AuthorSchema", only=("name",))

    @post_load
    def make_object(self, data, **kwargs):
        """Build a ``Book`` and register it in its author's book list."""
        book = Book(**data)
        # Without an author in the payload there is nothing to link back to.
        if 'author' not in data:
            return book
        siblings = book.author.books
        if book not in siblings:
            siblings.append(book)
        return book

class AuthorSchema(Schema):
    """Schema for an author that restores the author/book links on load."""

    name = fields.Str()
    books = fields.List(fields.Nested("BookSchema", exclude=("author",)))

    @post_load
    def make_object(self, data, **kwargs):
        """Build an ``Author`` and point each of its books back at it."""
        author = Author(**data)
        # Books are dumped without their author field, so set it here.
        for owned in author.books:
            owned.author = author
        return author

# Round-trip a book: dumps() returns the JSON string directly and loads()
# returns the object built by the post_load hook, so there is no ``.data``
# attribute to read (this matches the other examples in this article).
schema = BookSchema()
res = schema.dumps(potter)

new_potter = schema.loads(res)
print(potter)  # Book(title='potter', author=Author(name='JK', books=[...]))
print(new_potter)  # Book(title='potter', author=Author(name='JK', books=[...]))

# Round-trip the author in the same way.
schema = AuthorSchema()
res = schema.dumps(JK)
new_JK = schema.loads(res)
print(JK)  # Author(name='JK', books=[Book(title='potter', author=...)])
print(new_JK)  # Author(name='JK', books=[Book(title='potter', author=...)])