from typing import Optional, List, Union
import pandas as pd
from .utils import calculate_edge_alphas
from pandas_genomics.arrays import GenotypeDtype
from ..scalars import Region
@pd.api.extensions.register_series_accessor("genomics")
class GenotypeSeriesAccessor:
    """
    Series accessor for GenotypeArray methods

    Registered on ``pd.Series`` under the ``genomics`` namespace.

    .. code-block:: python

        s.genomics.variant_info
        s.genomics.encode_additive()
    """

    def __init__(self, obj):
        """Bind the accessor to a Series backed by a GenotypeDtype array.

        Parameters
        ----------
        obj: pd.Series
            The Series the accessor is attached to

        Raises
        ------
        AttributeError
            If the dtype of `obj` is not a GenotypeDtype
        """
        if not GenotypeDtype.is_dtype(obj.dtype):
            raise AttributeError(
                f"Incompatible datatype ({obj.dtype}), must be a GenotypeDtype"
            )
        self._array = obj.array
        self._index = obj.index
        self._name = obj.name

    def _wrap_method(self, method, *args, **kwargs):
        """Call `method` and wrap its result in a Series.

        The resulting Series reuses the index and name of the Series this
        accessor was created from.
        """
        # The stored attributes are the underscore-prefixed ones set in __init__
        return pd.Series(
            method(*args, **kwargs), index=self._index, name=self._name
        )

    ####################
    # Variant Properties
    ####################
    @property
    def variant(self):
        """Retrieve the variant object

        Returns
        -------
        variant: Variant
        """
        return self._array.variant

    @property
    def variant_info(self):
        """Retrieve the variant as a pandas Series

        Returns
        -------
        variant: pd.Series
            Built from ``variant.as_dict()`` and named after the source Series
        """
        return pd.Series(self._array.variant.as_dict(), name=self._name)

    #######################
    # Genotype Properties #
    #######################
    @property
    def gt_scores(self):
        """Return an array of genotype scores as float values

        np.nan when the score is missing
        """
        return self._array.gt_scores

    #########################
    # Calculated Properties #
    #########################
    @property
    def maf(self):
        """Return the minor allele frequency

        See :py:attr:`GenotypeArray.maf`"""
        return self._array.maf

    @property
    def hwe_pval(self):
        """Return the probability that the samples are in HWE

        See :py:attr:`GenotypeArray.hwe_pval`"""
        return self._array.hwe_pval

    ####################
    # In-place methods #
    ####################
    def set_reference(self, allele) -> None:
        """Change the allele reference variant.

        See :meth:`GenotypeArray.set_reference`

        Parameters
        ----------
        allele: str
            Must match an allele already in the variant

        Returns
        -------
        None
        """
        self._array.set_reference(allele)

    ############
    # Encoding #
    ############
    def encode_additive(self) -> pd.Series:
        """Additive encoding of genotypes.

        See :meth:`GenotypeArray.encode_additive`

        Returns
        -------
        pd.Series
        """
        return self._wrap_method(self._array.encode_additive)

    def encode_dominant(self) -> pd.Series:
        """Dominant encoding of genotypes.

        See :meth:`GenotypeArray.encode_dominant`

        Returns
        -------
        pd.Series
        """
        return self._wrap_method(self._array.encode_dominant)

    def encode_recessive(self) -> pd.Series:
        """Recessive encoding of genotypes.

        See :meth:`GenotypeArray.encode_recessive`

        Returns
        -------
        pd.Series
        """
        return self._wrap_method(self._array.encode_recessive)

    def encode_codominant(self) -> pd.Series:
        """Codominant encoding of genotypes.

        See :meth:`GenotypeArray.encode_codominant`

        Returns
        -------
        pd.Series
        """
        return self._wrap_method(self._array.encode_codominant)

    def encode_edge(
        self,
        alpha_value: float,
        ref_allele: str,
        alt_allele: str,
        minor_allele_freq: float,
    ) -> pd.Series:
        """EDGE (weighted) encoding of genotypes.

        All arguments are forwarded unchanged, in this order, to
        :meth:`GenotypeArray.encode_edge`; see that method for their exact
        semantics.  Suitable values are described in
        :meth:`calculate_edge_encoding_values`.

        Parameters
        ----------
        alpha_value: float
            Alpha value (used for heterozygous genotypes)
        ref_allele: str
            Which allele is considered reference
        alt_allele: str
            Which allele is considered alternate
        minor_allele_freq: float
            MAF of the data used during calculation of the alpha value

        Returns
        -------
        pd.Series
        """
        return self._wrap_method(
            self._array.encode_edge,
            alpha_value,
            ref_allele,
            alt_allele,
            minor_allele_freq,
        )

    def calculate_edge_encoding_values(
        self,
        data: pd.DataFrame,
        outcome_variable: str,
        covariates: Optional[List[str]] = None,
    ):
        """
        Calculate alpha values to be used in weighted encoding

        Parameters
        ----------
        data:
            Data to be used in the regression, including the outcome and covariates
        outcome_variable:
            The variable to be used as the output (y) of the regression
        covariates:
            Other variables to be included in the regression formula

        Returns
        -------
        Dict
          Variant ID: str
          Alpha Value - used for heterozygous genotypes
          Ref Allele - which allele is considered reference
          Alt Allele - which allele is considered alternate
          Minor Allele Frequency - MAF of data used during calculation of alpha values

        Notes
        -----
        See [1]_ for more information about weighted encoding.

        References
        ----------
        .. [1] Hall, Molly A., et al.
               "Novel EDGE encoding method enhances ability to identify genetic interactions."
               PLoS genetics 17.6 (2021): e1009534.
        """
        # Rebuild the genotype Series from the stored array, index and name so
        # the helper receives the same data the accessor was created from.
        return calculate_edge_alphas(
            genotypes=pd.Series(self._array, name=self._name, index=self._index),
            data=data,
            outcome_variable=outcome_variable,
            covariates=covariates,
        )

    ##############
    # QC Methods #
    ##############
    # TODO

    #################
    # Other Methods #
    #################
    def contained_by(self, regions: Union[Region, List[Region]]):
        """
        True if the variant is contained within the specified region(s)

        Parameters
        ----------
        regions: Region or List[Region]
            A single region, or an iterable of regions; with several regions
            the result is True as soon as any one of them contains the variant

        Returns
        -------
        bool
        """
        variant = self.variant
        if isinstance(regions, Region):
            return regions.contains_variant(variant)
        return any(region.contains_variant(variant) for region in regions)