Source code for pandas_genomics.sim.random_gt

from typing import List, Union

import numpy as np

from pandas_genomics.arrays import GenotypeArray, GenotypeDtype
from pandas_genomics.scalars import Variant, MISSING_IDX


[docs]def generate_random_gt( variant: Variant, alt_allele_freq: Union[List[float], float], n: int = 1000, random_seed: int = 1855, ) -> GenotypeArray: """ Simulate random genotypes according to the provided allele frequencies Parameters ---------- variant: Variant alt_allele_freq: float or List[float] Allele frequencies for each alternate allele in the variant (Bialleleic variants may specify a single float value) n: int, default 1000 How many genotypes to simulate random_seed: int, default 1855 Returns ------- GenotypeArray """ # Validate frequencies if isinstance(alt_allele_freq, float): # Convert it into a list alt_allele_freq = [ alt_allele_freq, ] if len(alt_allele_freq) != len(variant.alleles) - 1: raise ValueError( f"The number of provided frequencies ({len(alt_allele_freq)}) doesn't match" f" the number of alternate alleles in the variant ({len(variant.alleles)-1})." ) if sum(alt_allele_freq) > 1.0: raise ValueError( f"The provided frequencies must not sum to > 1.0 (sum was {sum(alt_allele_freq):.3e})" ) # Set remaining odds to the reference allele allele_freq = [ 1 - sum(alt_allele_freq), ] + alt_allele_freq # Choose gts np.random.seed(random_seed) genotypes = np.random.choice( range(len(variant.alleles)), p=allele_freq, size=(n, variant.ploidy) ) # Create GenotypeArray representation of the data dtype = GenotypeDtype(variant) scores = np.ones(n) * MISSING_IDX data = np.array(list(zip(genotypes, scores)), dtype=dtype._record_type) gt_array = GenotypeArray(values=data, dtype=dtype) return gt_array