These steps are meant to be used on a table level of a resource. This includes various different operations from simple validation or writing to the disc to complex re-shaping like pivoting or melting.
Group rows under the given group_name then apply aggregation functions provided as aggregation dictionary (see example)
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform-groups.csv")
target = transform(
source,
steps=[
steps.table_normalize(),
steps.table_aggregate(
group_name="name", aggregation={"sum": ("population", sum)}
),
],
)
print(target.schema)
print(target.to_view())
{'fields': [{'name': 'name', 'type': 'string'}, {'name': 'sum', 'type': 'any'}]}
+-----------+-----+
| name | sum |
+===========+=====+
| 'france' | 120 |
+-----------+-----+
| 'germany' | 160 |
+-----------+-----+
| 'spain' | 80 |
+-----------+-----+
Aggregate table. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, aggregation: Dict[str, Any], group_name: str) -> None
A dictionary with aggregation function. The values could be max, min, len and sum.
Dict[str, Any]
Field by which the rows will be grouped.
str
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_attach(resource=Resource(data=[["note"], ["large"], ["mid"]])),
],
)
print(target.schema)
print(target.to_view())
{'fields': [{'name': 'id', 'type': 'integer'},
{'name': 'name', 'type': 'string'},
{'name': 'population', 'type': 'integer'},
{'name': 'note', 'type': 'string'}]}
+----+-----------+------------+---------+
| id | name | population | note |
+====+===========+============+=========+
| 1 | 'germany' | 83 | 'large' |
+----+-----------+------------+---------+
| 2 | 'france' | 66 | 'mid' |
+----+-----------+------------+---------+
| 3 | 'spain' | 47 | None |
+----+-----------+------------+---------+
Attach table. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, resource: Union[Resource, str]) -> None
Data Resource to attach to the existing table.
Union[Resource, str]
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_debug(function=print),
],
)
print(target.to_view())
{'id': 1, 'name': 'germany', 'population': 83}
{'id': 2, 'name': 'france', 'population': 66}
{'id': 3, 'name': 'spain', 'population': 47}
+----+-----------+------------+
| id | name | population |
+====+===========+============+
| 1 | 'germany' | 83 |
+----+-----------+------------+
| 2 | 'france' | 66 |
+----+-----------+------------+
| 3 | 'spain' | 47 |
+----+-----------+------------+
Debug table. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, function: Any) -> None
Debug function to apply to the table row.
Any
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_normalize(),
steps.table_diff(
resource=Resource(
data=[
["id", "name", "population"],
[1, "germany", 83],
[2, "france", 50],
[3, "spain", 47],
]
)
),
]
)
print(target.schema)
print(target.to_view())
{'fields': [{'name': 'id', 'type': 'integer'},
{'name': 'name', 'type': 'string'},
{'name': 'population', 'type': 'integer'}]}
+----+----------+------------+
| id | name | population |
+====+==========+============+
| 2 | 'france' | 66 |
+----+----------+------------+
Diff tables. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, resource: Union[Resource, str], ignore_order: bool = False, use_hash: bool = False) -> None
Resource with which to compare.
Union[Resource, str]
Specifies whether to ignore the order of the rows.
bool
Specifies whether to use hash or not. If yes, alternative implementation will be used where the complement is executed by constructing an in-memory set for all rows found in the right hand table. For more information please see the link below: https://petl.readthedocs.io/en/stable/transform.html#petl.transform.setops.hashcomplement
bool
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_normalize(),
steps.table_intersect(
resource=Resource(
data=[
["id", "name", "population"],
[1, "germany", 83],
[2, "france", 50],
[3, "spain", 47],
]
),
),
]
)
print(target.schema)
print(target.to_view())
{'fields': [{'name': 'id', 'type': 'integer'},
{'name': 'name', 'type': 'string'},
{'name': 'population', 'type': 'integer'}]}
+----+-----------+------------+
| id | name | population |
+====+===========+============+
| 1 | 'germany' | 83 |
+----+-----------+------------+
| 3 | 'spain' | 47 |
+----+-----------+------------+
Intersect tables. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, resource: Union[Resource, str], use_hash: bool = False) -> None
Resource with which to apply intersection.
Union[Resource, str]
Specifies whether to use hash or not. If yes, an alternative implementation will be used. For more information please see the link below: https://petl.readthedocs.io/en/stable/transform.html#petl.transform.setops.hashintersection
bool
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_normalize(),
steps.table_join(
resource=Resource(data=[["id", "note"], [1, "beer"], [2, "vine"]]),
field_name="id",
),
]
)
print(target.schema)
print(target.to_view())
{'fields': [{'name': 'id', 'type': 'integer'},
{'name': 'name', 'type': 'string'},
{'name': 'population', 'type': 'integer'},
{'name': 'note', 'type': 'string'}]}
+----+-----------+------------+--------+
| id | name | population | note |
+====+===========+============+========+
| 1 | 'germany' | 83 | 'beer' |
+----+-----------+------------+--------+
| 2 | 'france' | 66 | 'vine' |
+----+-----------+------------+--------+
Join tables. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, resource: Union[Resource, str], field_name: Optional[str] = None, use_hash: bool = False, mode: str = inner) -> None
Resource with which to apply join.
Union[Resource, str]
Field name with which the join will be performed comparing it's value between two tables. If not provided natural join is tried. For more information, please see the following document: https://petl.readthedocs.io/en/stable/_modules/petl/transform/joins.html
Optional[str]
Specify whether to use hash or not. If True, an alternative implementation of join will be used.
bool
Specifies which mode to use. The available modes are: "inner", "left", "right", "outer", "cross" and "negate". The default mode is "inner".
str
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_normalize(),
steps.table_melt(field_name="name"),
]
)
print(target.schema)
print(target.to_view())
{'fields': [{'name': 'name', 'type': 'string'},
{'name': 'variable', 'type': 'string'},
{'name': 'value', 'type': 'any'}]}
+-----------+--------------+-------+
| name | variable | value |
+===========+==============+=======+
| 'germany' | 'id' | 1 |
+-----------+--------------+-------+
| 'germany' | 'population' | 83 |
+-----------+--------------+-------+
| 'france' | 'id' | 2 |
+-----------+--------------+-------+
| 'france' | 'population' | 66 |
+-----------+--------------+-------+
| 'spain' | 'id' | 3 |
+-----------+--------------+-------+
...
Melt tables. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, field_name: str, variables: Optional[str] = None, to_field_names: List[str] = NOTHING) -> None
Field name which will be use to melt table. It will keep the field 'field_name' as it is but melt other fields into data.
str
List of name of fields which will be melted into data.
Optional[str]
Labels for new fields that will be created "variable" and "value".
List[str]
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_merge(
resource=Resource(data=[["id", "name", "note"], [4, "malta", "island"]])
),
]
)
print(target.schema)
print(target.to_view())
Merge tables. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, resource: Union[Resource, str], field_names: List[str] = NOTHING, sort_by_field: Optional[str] = None, ignore_fields: bool = False) -> None
Resource to merge with.
Union[Resource, str]
Specifies fixed headers for output table.
List[str]
Field name by which to sort the record after merging.
Optional[str]
If ignore_fields is set to True, it will merge two resource without matching headers.
bool
The table_normalize
step normalizes an underlaying tabular stream (cast types and fix dimensions) according to a provided or inferred schema. If your data is not really big it's recommended to normalize a table before any others steps.
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource("table.csv")
print(source.read_cells())
target = transform(
source,
steps=[
steps.table_normalize(),
]
)
print(target.read_cells())
[['id', 'name'], ['1', 'english'], ['2', 'ä¸å›½äºº']]
[['id', 'name'], [1, 'english'], [2, 'ä¸å›½äºº']]
Normalize table. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None) -> None
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform-pivot.csv")
target = transform(
source,
steps=[
steps.table_normalize(),
steps.table_pivot(f1="region", f2="gender", f3="units", aggfun=sum),
]
)
print(target.schema)
print(target.to_view())
Pivot table. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, f1: str, f2: str, f3: str, aggfun: Any) -> None
Field that makes the rows in the output pivot table.
str
Field that makes the columns in the output pivot table.
str
Field that forms the data in the output pivot table.
str
Function to process and create data in the output pivot table. The function can be "sum", "max", "min", "len" etc.
Any
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_normalize(),
steps.table_print(),
]
)
== ======= ==========
id name population
== ======= ==========
1 germany 83
2 france 66
3 spain 47
== ======= ==========
Print table. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None) -> None
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_normalize(),
steps.table_melt(field_name="id"),
steps.table_recast(field_name="id"),
]
)
print(target.schema)
print(target.to_view())
{'fields': [{'name': 'id', 'type': 'integer'},
{'name': 'name', 'type': 'string'},
{'name': 'population', 'type': 'integer'}]}
+----+-----------+------------+
| id | name | population |
+====+===========+============+
| 1 | 'germany' | 83 |
+----+-----------+------------+
| 2 | 'france' | 66 |
+----+-----------+------------+
| 3 | 'spain' | 47 |
+----+-----------+------------+
Recast table. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, field_name: str, from_field_names: List[str] = NOTHING) -> None
Recast table by the field 'field_name'.
str
List of field names for the output table.
List[str]
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_normalize(),
steps.table_transpose(),
]
)
print(target.schema)
print(target.to_view())
{'fields': [{'name': 'id', 'type': 'string'},
{'name': '1', 'type': 'any'},
{'name': '2', 'type': 'any'},
{'name': '3', 'type': 'any'}]}
+--------------+-----------+----------+---------+
| id | 1 | 2 | 3 |
+==============+===========+==========+=========+
| 'name' | 'germany' | 'france' | 'spain' |
+--------------+-----------+----------+---------+
| 'population' | 83 | 66 | 47 |
+--------------+-----------+----------+---------+
Transpose table. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None) -> None
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.cell_set(field_name="population", value="bad"),
steps.table_validate(),
]
)
pprint(target.schema)
try:
pprint(target.to_view())
except Exception as exception:
pprint(exception)
{'fields': [{'name': 'id', 'type': 'integer'},
{'name': 'name', 'type': 'string'},
{'name': 'population', 'type': 'integer'}]}
FrictionlessException('[step-error] Step is not valid: "table_validate" raises "[type-error] Type error in the cell "bad" in row "2" and field "population" at position "3": type is "integer/default" " ')
Validate table. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None) -> None
from pprint import pprint
from frictionless import Package, Resource, transform, steps
source = Resource(path="transform.csv")
target = transform(
source,
steps=[
steps.table_write(path='transform.json'),
]
)
Let's read the output:
cat transform.json
[
[
"id",
"name",
"population"
],
[
1,
"germany",
83
],
[
2,
"france",
66
],
[
3,
"spain",
47
]
]
with open('transform.json') as file:
print(file.read())
[
[
"id",
"name",
"population"
],
[
1,
"germany",
83
],
[
2,
"france",
66
],
[
3,
"spain",
47
]
]
Write table. This step can be added using the `steps` parameter for the `transform` function.
(*, name: Optional[str] = None, title: Optional[str] = None, description: Optional[str] = None, path: str) -> None
Path of the file to write the table content.
str