# Source code for dscript.foldseek

import biotite.structure as struc
import biotite.structure.alphabet as alphabet
import biotite.structure.io as strucio
import torch
from loguru import logger

from .utils import log

# Letter -> integer index table for the foldseek structural-alphabet
# characters used in this module. Indices follow the order of the string
# below: "D" is 0 and "X" is 20 (21 entries in total).
fold_vocab = dict(zip("DPVQAWKEITLFGSMHCRYNX", range(21)))


def get_foldseek_onehot(n0, size_n0, fold_record, fold_vocab):
    """
    Build a one-hot encoding of the foldseek sequence stored for ``n0``.

    fold_record is just a dictionary {ensembl_gene_name => foldseek_sequence}

    :param n0: Key to look up in ``fold_record``.
    :param size_n0: Expected sequence length; becomes the number of rows of
        the returned tensor.
    :param fold_record: Mapping from name to foldseek sequence.
    :param fold_vocab: Mapping from sequence character to column index.
    :return: ``torch.float32`` tensor of shape ``(size_n0, len(fold_vocab))``.
        Row ``i`` has a single 1 in column ``fold_vocab[fold_seq[i]]``. When
        ``n0`` is not in ``fold_record`` the tensor is all zeros.
    :raises AssertionError: If the stored sequence length differs from
        ``size_n0`` or the sequence contains a character missing from
        ``fold_vocab``.
    """
    if n0 not in fold_record:
        return torch.zeros(size_n0, len(fold_vocab), dtype=torch.float32)

    fold_seq = fold_record[n0]

    # Raised explicitly (rather than via ``assert``) so the checks are still
    # enforced under ``python -O``; AssertionError is kept so the exception
    # type seen by existing callers does not change.
    if size_n0 != len(fold_seq):
        raise AssertionError(
            f"Foldseek sequence for {n0!r} has length {len(fold_seq)}, "
            f"expected {size_n0}"
        )

    columns = []
    for position, state in enumerate(fold_seq):
        if state not in fold_vocab:
            raise AssertionError(
                f"Unknown foldseek character {state!r} at position "
                f"{position} of {n0!r}"
            )
        columns.append(fold_vocab[state])

    foldseek_enc = torch.zeros(size_n0, len(fold_vocab), dtype=torch.float32)
    if columns:
        # One indexed assignment instead of a per-element write for each
        # position of the sequence.
        rows = torch.arange(size_n0)
        foldseek_enc[rows, torch.tensor(columns, dtype=torch.long)] = 1
    return foldseek_enc
def get_3di_sequences(pdb_files: list[str]) -> dict[str, str]:
    """
    Extract 3Di sequences from PDB/mmCIF files using
    biotite.structure.alphabet.to_3di(atoms).

    At this time, this function will only extract a 3Di sequence for a single
    chain in each file: the chain whose ID sorts first among the chains that
    contain amino-acid atoms (which is not necessarily the first chain listed
    in the file). If you need to extract multiple chains, you will need to
    modify this function. This is to maintain consistent naming support with
    the rest of D-SCRIPT training and inference scripts, as the current
    requirement is that pdb file names match fasta header names.

    Files that cannot be processed are logged and skipped; they do not abort
    the whole run.

    :param pdb_files: Paths of the structure files to process.
    :return: Dictionary ``{basename: 3Di sequence}`` where ``basename`` is the
        file name up to its first ``"."`` and the sequence is an upper-cased
        string. Files sharing a basename overwrite each other (last one wins).
    """
    # Function-scope import: keeps this block self-contained without touching
    # the module's import section.
    from pathlib import Path

    seq_records = {}
    for pdb_path in pdb_files:
        # Path(...).name honours the platform's path separator; the text
        # before the first "." is kept as the record name, as before.
        basename = Path(str(pdb_path)).name.split(".")[0]
        try:
            atoms = strucio.load_structure(str(pdb_path))
            atoms = atoms[struc.filter_amino_acids(atoms)]

            chains = sorted(set(atoms.chain_id))
            if not chains:
                # Nothing left after filtering: skip explicitly instead of
                # failing on chains[0] and reporting an opaque index error.
                logger.warning(f"No amino acid atoms found in {pdb_path}")
                continue

            first_chain = chains[0]
            chain_atoms = atoms[atoms.chain_id == first_chain]

            # Only the first returned sequence is used; the second return
            # value is not needed here.
            seq_3di, _ = alphabet.to_3di(chain_atoms)
            seq_records[basename] = str(seq_3di[0]).upper()
        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the
            # next file.
            log(f"Error processing {pdb_path}: {e}")
            continue
    return seq_records