Source code for pandas_genomics.io.vcf

from pathlib import Path
from typing import Union

import pandas as pd
import numpy as np

from ..arrays import GenotypeArray, GenotypeDtype
from ..scalars import Variant, MISSING_IDX, Genotype


[docs]def from_vcf( filename: Union[str, Path], min_qual: float = 0, drop_filtered: bool = True ): """ Load genetic data from a VCF or BCF file into a DataFrame Parameters ---------- filename: str or Path vcf, vcf.gz, or bcf file. min_qual: float (default = 0) Skip loading variants with less than this quality drop_filtered: boolean (default = True) Skip loading variants with a FILTER value other than "PASS" Returns ------- DataFrame Columns correspond to variants (named as {variant_number}_{variant ID}). Rows correspond to samples and index columns include sample information. Examples -------- """ from cyvcf2 import VCF # Import here since installing htslib on Windows is tricky genotype_array_dict = dict() for var_num, vcf_variant in enumerate(VCF(filename)): # or VCF('some.bcf') # Skip filtered variants unless drop_filtered is False if vcf_variant.FILTER is not None and drop_filtered: continue # Skip variants below the minimum quality if vcf_variant.QUAL < min_qual: continue if len(vcf_variant.ALT) >= MISSING_IDX: raise ValueError( f"Could not load {vcf_variant.ID} due to too many ALT alleles" f" ({len(vcf_variant.ALT)} > {MISSING_IDX-1})" ) # Make variant variant = Variant( chromosome=vcf_variant.CHROM, position=vcf_variant.start, id=vcf_variant.ID, ref=vcf_variant.REF, alt=vcf_variant.ALT, ploidy=vcf_variant.ploidy, score=int(vcf_variant.QUAL), ) dtype = GenotypeDtype(variant) # Collect genotypes allele_idxs = np.array(vcf_variant.genotypes)[:, :2] allele_idxs = np.where(allele_idxs == -1, MISSING_IDX, allele_idxs) gt_scores = vcf_variant.gt_quals # Convert genotype scores from float values to uint8 values gt_scores = np.where(gt_scores > 254, 254, gt_scores) # Max Score gt_scores = np.where(gt_scores < 0, 255, gt_scores) # Min Score (<0 is missing) gt_scores = np.where(gt_scores == -1, 255, gt_scores) # Missing values gt_scores = gt_scores.round().astype("uint8") values = np.array(list(zip(allele_idxs, gt_scores)), dtype=dtype._record_type) # Make the GenotypeArray gt_array = GenotypeArray(values=values, dtype=dtype) # Make the variant name if gt_array.variant.id is None: var_name = f"Variant_{var_num}" else: var_name = gt_array.variant.id # Save to the dict genotype_array_dict[var_name] = gt_array df = pd.DataFrame.from_dict(genotype_array_dict) return df