# common.libs.pyarrow
#### remove_null_columns

```python
def remove_null_columns(item: TAnyArrowItem) -> TAnyArrowItem
```

Remove all columns of datatype `pyarrow.null()` from the table or record batch.
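A minimal sketch, assuming this module is importable as `dlt.common.libs.pyarrow` (the import path is not stated in this reference):

```python
import pyarrow as pa

from dlt.common.libs.pyarrow import remove_null_columns

# Build a table with one real column and one column typed pyarrow.null()
table = pa.table({
    "id": pa.array([1, 2, 3]),
    "empty": pa.array([None, None, None], type=pa.null()),
})

cleaned = remove_null_columns(table)
print(cleaned.column_names)  # ['id'] - the null-typed column is dropped
```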
#### remove_columns

```python
def remove_columns(item: TAnyArrowItem,
                   columns: Sequence[str]) -> TAnyArrowItem
```

Remove `columns` from the Arrow item.
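A minimal sketch of dropping columns by name, under the same import-path assumption as above:

```python
import pyarrow as pa

from dlt.common.libs.pyarrow import remove_columns

table = pa.table({"a": [1, 2], "b": ["x", "y"], "c": [0.1, 0.2]})
slim = remove_columns(table, ["b", "c"])
print(slim.column_names)  # ['a']
```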
#### append_column

```python
def append_column(item: TAnyArrowItem, name: str, data: Any) -> TAnyArrowItem
```

Appends a new column to a Table or RecordBatch.
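A minimal sketch; passing a `pyarrow.Array` as `data` is an assumption, since the signature only says `Any`:

```python
import pyarrow as pa

from dlt.common.libs.pyarrow import append_column

table = pa.table({"id": [1, 2, 3]})
with_flag = append_column(table, "flag", pa.array([True, False, True]))
print(with_flag.column_names)  # ['id', 'flag']
```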
#### rename_columns

```python
def rename_columns(item: TAnyArrowItem,
                   new_column_names: Sequence[str]) -> TAnyArrowItem
```

Rename Arrow columns on a Table or RecordBatch. Returns the same data, but with a renamed schema.
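A minimal sketch; the new names are positional, one per existing column:

```python
import pyarrow as pa

from dlt.common.libs.pyarrow import rename_columns

table = pa.table({"a": [1], "b": [2]})
renamed = rename_columns(table, ["first", "second"])
print(renamed.column_names)  # ['first', 'second']
```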
#### normalize_py_arrow_schema

```python
def normalize_py_arrow_schema(
        item: TAnyArrowItem, columns: TTableSchemaColumns,
        naming: NamingConvention,
        caps: DestinationCapabilitiesContext) -> TAnyArrowItem
```

Normalize the Arrow item schema according to the `columns`:

- Arrow schema field names will be normalized according to `naming`
- Arrow columns will be reordered according to `columns`
- empty columns will be inserted if they are missing, types will be generated using `caps`
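A hedged sketch: the `NamingConvention` and capabilities constructors used below are assumptions about dlt internals, not part of this module's documented API:

```python
import pyarrow as pa

from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.libs.pyarrow import normalize_py_arrow_schema
# assumption: dlt ships a snake_case naming convention at this path
from dlt.common.normalizers.naming.snake_case import NamingConvention

item = pa.table({"UserID": [1, 2]})
# dlt-style column hints, keyed by the normalized column name
columns = {
    "user_id": {"name": "user_id", "data_type": "bigint"},
    "created_at": {"name": "created_at", "data_type": "timestamp"},
}

normalized = normalize_py_arrow_schema(
    item,
    columns,
    naming=NamingConvention(),
    # assumption: generic capabilities supply the default type mapping
    caps=DestinationCapabilitiesContext.generic_capabilities(),
)
# field names normalized, order taken from `columns`,
# and an empty `created_at` column inserted
print(normalized.column_names)
```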
#### get_normalized_arrow_fields_mapping

```python
def get_normalized_arrow_fields_mapping(item: TAnyArrowItem,
                                        naming: NamingConvention) -> StrStr
```

Normalizes schema field names and returns a mapping from original to normalized name. Raises on name clashes.
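A minimal sketch, reusing the snake_case `NamingConvention` assumed above:

```python
import pyarrow as pa

from dlt.common.libs.pyarrow import get_normalized_arrow_fields_mapping
from dlt.common.normalizers.naming.snake_case import NamingConvention

item = pa.table({"UserID": [1], "CreatedAt": [2]})
mapping = get_normalized_arrow_fields_mapping(item, NamingConvention())
print(mapping)  # e.g. {'UserID': 'user_id', 'CreatedAt': 'created_at'}
```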
#### py_arrow_to_table_schema_columns

```python
def py_arrow_to_table_schema_columns(
        schema: pyarrow.Schema) -> TTableSchemaColumns
```

Convert a PyArrow schema to a table schema columns dict.

**Arguments**:

- `schema` _pyarrow.Schema_ - pyarrow schema

**Returns**:

`TTableSchemaColumns` - table schema columns
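A minimal sketch of converting a pyarrow schema into a columns dict; the exact dlt type name in the comment is an assumption:

```python
import pyarrow as pa

from dlt.common.libs.pyarrow import py_arrow_to_table_schema_columns

schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
columns = py_arrow_to_table_schema_columns(schema)
print(columns["id"]["data_type"])  # expected to be 'bigint' in dlt's type names
```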
#### get_row_count

```python
def get_row_count(parquet_file: TFileOrPath) -> int
```

Get the number of rows in a parquet file.

**Arguments**:

- `parquet_file` _str_ - path to parquet file

**Returns**:

`int` - number of rows
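A minimal sketch: write a small parquet file, then count its rows:

```python
import pyarrow as pa
import pyarrow.parquet as pq

from dlt.common.libs.pyarrow import get_row_count

pq.write_table(pa.table({"id": [1, 2, 3]}), "rows.parquet")
print(get_row_count("rows.parquet"))  # 3
```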
#### to_arrow_scalar

```python
def to_arrow_scalar(value: Any, arrow_type: pyarrow.DataType) -> Any
```

Converts a Python value to an Arrow compute-friendly version.
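A minimal sketch of wrapping a Python value as a typed Arrow scalar:

```python
import datetime

import pyarrow as pa

from dlt.common.libs.pyarrow import to_arrow_scalar

ts = to_arrow_scalar(
    datetime.datetime(2024, 1, 1, 12, 0, tzinfo=datetime.timezone.utc),
    pa.timestamp("us", tz="UTC"),
)
print(ts)  # a pyarrow scalar usable with pyarrow.compute kernels
```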
#### from_arrow_scalar

```python
def from_arrow_scalar(arrow_value: pyarrow.Scalar) -> Any
```

Converts an Arrow scalar into a Python type. Currently adds "UTC" to naive datetimes and converts all others to UTC.
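A minimal round-trip sketch, converting the Arrow scalar back to a Python value:

```python
import datetime

import pyarrow as pa

from dlt.common.libs.pyarrow import from_arrow_scalar, to_arrow_scalar

scalar = to_arrow_scalar(
    datetime.datetime(2024, 1, 1, 12, 0, tzinfo=datetime.timezone.utc),
    pa.timestamp("us", tz="UTC"),
)
value = from_arrow_scalar(scalar)
print(value, value.tzinfo)  # a timezone-aware datetime in UTC
```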
#### TNewColumns

Sequence of tuples: (field index, field, generating function).
#### pq_stream_with_new_columns

```python
def pq_stream_with_new_columns(
        parquet_file: TFileOrPath,
        columns: TNewColumns,
        row_groups_per_read: int = 1) -> Iterator[pyarrow.Table]
```

Add column(s) to the table in batches.

The table is read from parquet `row_groups_per_read` row groups at a time.

**Arguments**:

- `parquet_file` - path or file object to parquet file
- `columns` - list of columns to add in the form of (insertion index, `pyarrow.Field`, column_value_callback). The callback should accept a `pyarrow.Table` and return an array of values for the column.
- `row_groups_per_read` - number of row groups to read at a time. Defaults to 1.

**Yields**:

`pyarrow.Table` objects with the new columns added.
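A minimal sketch that streams a parquet file and appends a computed column to each yielded table; using -1 as the insertion index to append at the end is an assumption, not documented above:

```python
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq

from dlt.common.libs.pyarrow import pq_stream_with_new_columns

pq.write_table(pa.table({"amount": [1.0, 2.0, 3.0]}), "amounts.parquet")

new_columns = [
    # (insertion index, field, callback); the callback receives each
    # pyarrow.Table chunk and must return an array of column values
    (-1, pa.field("doubled", pa.float64()),
     lambda t: pc.multiply(t["amount"], 2.0)),
]

for table in pq_stream_with_new_columns("amounts.parquet", new_columns):
    print(table.column_names)  # ['amount', 'doubled']
```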