Source code for hail.experimental.tidyr

import hail as hl
from hail.table import Table
from hail.typecheck import nullable, oneof, sequenceof, typecheck
from hail.utils import new_temp_file, wrap_to_list


[docs]@typecheck(ht=Table, key=str, value=str, fields=str) def gather(ht, key, value, *fields) -> Table: """Collapse fields into key-value pairs. :func:`.gather` mimics the functionality of the `gather()` function found in R's ``tidyr`` package. This is a way to turn "wide" format data into "long" format data. Parameters ---------- ht : :class:`.Table` A Hail table. key : :class:`str` The name of the key field in the gathered table. value : :class:`str` The name of the value field in the gathered table. fields : variable-length args of obj:`str` Names of fields to gather in ``ht``. Returns ------- :class:`.Table` Table with original ``fields`` gathered into ``key`` and ``value`` fields.""" ht = ht.annotate(_col_val=hl.array([hl.struct(field_name=field, value=ht[field]) for field in fields])) ht = ht.drop(*fields) ht = ht.explode(ht['_col_val']) ht = ht.annotate(**{key: ht['_col_val'][0], value: ht['_col_val'][1]}) ht = ht.drop('_col_val') ht_tmp = new_temp_file() ht.write(ht_tmp) return hl.read_table(ht_tmp)
[docs]@typecheck(ht=Table, field=str, value=str, key=nullable(oneof(str, sequenceof(str)))) def spread(ht, field, value, key=None) -> Table: """Spread a key-value pair of fields across multiple fields. :func:`.spread` mimics the functionality of the `spread()` function in R's `tidyr` package. This is a way to turn "long" format data into "wide" format data. Given a ``field``, :func:`.spread` will create a new table by grouping ``ht`` by its row key and, optionally, any additional fields passed to the ``key`` argument. After collapsing ``ht`` by these keys, :func:`.spread` creates a new row field for each unique value of ``field``, where the row field values are given by the corresponding ``value`` in the original ``ht``. Parameters ---------- ht : :class:`.Table` A Hail table. field : :class:`str` The name of the factor field in `ht`. value : :class:`str` The name of the value field in `ht`. key : optional, obj:`str` or list of :class:`str` The name of any fields to group by, in addition to the row key fields of ``ht``. Returns ------- :class:`.Table` Table with original ``key`` and ``value`` fields spread across multiple columns.""" if key is None: key = list(ht.key) else: key = wrap_to_list(key) key = list(ht.key) + key field_vals = list(ht.aggregate(hl.agg.collect_as_set(ht[field]))) ht = ht.group_by(*key).aggregate( **{rv: hl.agg.take(ht[rv], 1)[0] for rv in ht.row_value if rv not in set([*key, field, value])}, **{ fv: hl.agg.filter( ht[field] == fv, hl.rbind(hl.agg.take(ht[value], 1), lambda take: hl.if_else(hl.len(take) > 0, take[0], 'NA')), ) for fv in field_vals }, ) ht_tmp = new_temp_file() ht.write(ht_tmp) return ht
[docs]@typecheck(ht=Table, field=str, into=sequenceof(str), delim=oneof(str, int)) def separate(ht, field, into, delim) -> Table: """Separate a field into multiple fields by splitting on a delimiter character or position. :func:`.separate` mimics the functionality of the `separate()` function in R's ``tidyr`` package. This function will create a new table where ``field`` has been split into multiple new fields, whose names are given by ``into``. If ``delim`` is a ``str`` (including regular expression strings), ``field`` will be separated into columns by that string. In this case, the length of ``into`` must match the number of resulting fields. If ``delim`` is an ``int``, ``field`` will be separated into two row fields, where the first field contains the first ``delim`` characters of ``field`` and the second field contains the remaining characters. Parameters ---------- ht : :class:`.Table` A Hail table. field : :class:`str` The name of the field to separate in ``ht``. into : list of :class:`str` The names of the fields to create by separating ``field``. delimiter : :class:`str` or :obj:`int` The character or position by which to separate ``field``. Returns ------- :class:`.Table` Table with original ``field`` split into fields whose names are defined by `into`.""" if isinstance(delim, int): ht = ht.annotate(**{into[0]: ht[field][:delim], into[1]: ht[field][delim:]}) else: split = ht[field].split(delim) ht = ht.annotate(**{into[i]: split[i] for i in range(len(into))}) ht = ht.drop(field) ht_tmp = new_temp_file() ht.write(ht_tmp) return ht