"""Contains functions for dealing with the .pdb file format."""
from datetime import datetime
import re
from itertools import groupby, chain
import valerius
from math import ceil
from .data import CODES
from .structures import Residue, Ligand
from .mmcif import add_secondary_structure_to_polymers
[docs]def pdb_string_to_pdb_dict(filestring):
"""Takes a .pdb filestring and turns into a ``dict`` which represents its
record structure. Only lines which aren't empty are used.
The resultant dictionary has line types as the keys, which point to the
lines as its value. So ``{"TITLE": ["TITLE line 1", "TITLE line 2"]}`` etc.
The exceptions are the REMARK records, where there is a sub-dictionary with
REMARK numbers as keys, and the structure records themselves which are just
arranged into lists - one for each model.
:param str filestring: the .pdb filestring to process.
:rtype: ``dict``"""
pdb_dict = {}
lines = list(filter(lambda l: bool(l.strip()), filestring.split("\n")))
lines = [[line[:6].rstrip(), line.rstrip()] for line in lines]
model_recs = ("ATOM", "HETATM", "ANISOU", "MODEL", "TER", "ENDMDL")
model = []
in_model = False
for head, line in lines:
if head == "REMARK":
if "REMARK" not in pdb_dict: pdb_dict["REMARK"] = {}
number = line.lstrip().split()[1]
update_dict(pdb_dict["REMARK"], number, line)
elif head in model_recs:
if "MODEL" not in pdb_dict: pdb_dict["MODEL"] = [[]]
if head == "ENDMDL":
pdb_dict["MODEL"].append([])
elif head != "MODEL":
pdb_dict["MODEL"][-1].append(line)
else:
update_dict(pdb_dict, head, line)
if "MODEL" in pdb_dict and not pdb_dict["MODEL"][-1]: pdb_dict["MODEL"].pop()
return pdb_dict
[docs]def update_dict(d, key, value):
"""Takes a dictionary where the values are lists, and adds a value to one of
the lists at the specific key. If the list doesn't exist, it creates it
first.
The dictionary is changed in place.
:param dict d: the dictionary to update.
:param str key: the location of the list.
:param str value: the value to add to the list."""
try:
d[key].append(value)
except: d[key] = [value]
[docs]def pdb_dict_to_data_dict(pdb_dict):
"""Converts an .pdb dictionary into an atomium data dictionary, with the
same standard layout that the other file formats get converted into.
:param dict pdb_dict: the .pdb dictionary.
:rtype: ``dict``"""
data_dict = {
"description": {
"code": None, "title": None, "deposition_date": None,
"classification": None, "keywords": [], "authors": []
}, "experiment": {
"technique": None, "source_organism": None, "expression_system": None,
"missing_residues": []
}, "quality": {"resolution": None, "rvalue": None, "rfree": None},
"geometry": {"assemblies": [], "crystallography": {}}, "models": []
}
update_description_dict(pdb_dict, data_dict)
update_experiment_dict(pdb_dict, data_dict)
update_quality_dict(pdb_dict, data_dict)
update_geometry_dict(pdb_dict, data_dict)
update_models_list(pdb_dict, data_dict)
return data_dict
[docs]def update_description_dict(pdb_dict, data_dict):
"""Creates the description component of a standard atomium data dictionary
from a .pdb dictionary.
:param dict pdb_dict: The .pdb dictionary to read.
:param dict data_dict: The data dictionary to update."""
extract_header(pdb_dict, data_dict["description"])
extract_title(pdb_dict, data_dict["description"])
extract_keywords(pdb_dict, data_dict["description"])
extract_authors(pdb_dict, data_dict["description"])
[docs]def update_experiment_dict(pdb_dict, data_dict):
"""Creates the experiment component of a standard atomium data dictionary
from a .pdb dictionary.
:param dict pdb_dict: The .pdb dictionary to read.
:param dict data_dict: The data dictionary to update."""
extract_technique(pdb_dict, data_dict["experiment"])
extract_source(pdb_dict, data_dict["experiment"])
extract_missing_residues(pdb_dict, data_dict["experiment"])
[docs]def update_quality_dict(pdb_dict, data_dict):
"""Creates the quality component of a standard atomium data dictionary
from a .pdb dictionary.
:param dict pdb_dict: The .pdb dictionary to read.
:param dict data_dict: The data dictionary to update."""
extract_resolution_remark(pdb_dict, data_dict["quality"])
extract_rvalue_remark(pdb_dict, data_dict["quality"])
[docs]def update_geometry_dict(pdb_dict, data_dict):
"""Creates the geometry component of a standard atomium data dictionary
from a .pdb dictionary.
:param dict pdb_dict: The .pdb dictionary to read.
:param dict data_dict: The data dictionary to update."""
extract_assembly_remark(pdb_dict, data_dict["geometry"])
extract_crystallography(pdb_dict, data_dict["geometry"])
[docs]def update_models_list(pdb_dict, data_dict):
"""Creates model dictionaries in a data dictionary.
:param dict pdb_dict: The .pdb dictionary to read.
:param dict data_dict: The data dictionary to update."""
sequences = make_sequences(pdb_dict)
secondary_structure = make_secondary_structure(pdb_dict)
full_names = get_full_names(pdb_dict)
for model_lines in pdb_dict["MODEL"]:
aniso = make_aniso(model_lines)
last_ter = get_last_ter_line(model_lines)
model = {"polymer": {}, "non-polymer": {}, "water": {}}
count = 0
for index, line in enumerate(model_lines):
if line[:6] in ["ATOM ", "HETATM"]:
chain_id = line[21] if index < last_ter else id_from_line(line)
res_id = id_from_line(line)
if index < last_ter:
add_atom_to_polymer(line, model, chain_id, res_id, aniso, full_names)
else:
add_atom_to_non_polymer(line, model, res_id, aniso, full_names)
for chain_id, chain in model["polymer"].items():
chain["sequence"] = sequences.get(chain_id, "")
add_secondary_structure_to_polymers(model, secondary_structure)
data_dict["models"].append(model)
[docs]def assembly_lines_to_assembly_dict(lines):
"""Takes the lines representing a single biological assembly and turns
them into an assembly dictionary.
:param list lines: The REMARK lines to read.
:rtype: ``dict``"""
assembly = {
"transformations": [], "software": None, "buried_surface_area": None,
"surface_area": None, "delta_energy": None, "id": 0
}
patterns = [[r"(.+)SOFTWARE USED: (.+)", "software", lambda x: x],
[r"(.+)BIOMOLECULE: (.+)", "id", int],
[r"(.+)SURFACE AREA: (.+) [A-Z]", "buried_surface_area", float],
[r"(.+)AREA OF THE COMPLEX: (.+) [A-Z]", "surface_area", float],
[r"(.+)FREE ENERGY: (.+) [A-Z]", "delta_energy", float]]
t = None
for line in lines:
for p in patterns:
matches = re.findall(p[0], line)
if matches: assembly[p[1]] = p[2](matches[0][1].strip())
if "APPLY THE FOLLOWING" in line:
if t: assembly["transformations"].append(t)
t = {"chains": [], "matrix": [], "vector": []}
if "CHAINS:" in line:
t["chains"] += [c.strip() for c in
line.split(":")[-1].strip().split(",") if c.strip()]
if "BIOMT" in line:
values = [float(x) for x in line.split()[4:]]
if len(t["matrix"]) == 3:
assembly["transformations"].append(t)
t = {"chains": t["chains"], "matrix": [], "vector": []}
t["matrix"].append(values[:3])
t["vector"].append(values[-1])
if t: assembly["transformations"].append(t)
return assembly
[docs]def make_sequences(pdb_dict):
"""Creates a mapping of chain IDs to sequences, by parsing SEQRES records.
:param dict pdb_dict: the .pdb dictionary to read.
:rtype: ``dict``"""
seq = {}
if pdb_dict.get("SEQRES"):
for line in pdb_dict["SEQRES"]:
chain, residues = line[11], line[19:].strip().split()
if chain not in seq:
seq[chain] = []
seq[chain] += residues
return {k: "".join([CODES.get(r, "X") for r in v]) for k, v in seq.items()}
[docs]def make_secondary_structure(pdb_dict):
"""Creates a dictionary of helices and strands, with each having a list of
start and end residues.
:param pdb_dict: the .pdb dict to read.
:rtype: ``dict``"""
helices, strands = [], []
for helix in pdb_dict.get("HELIX", []):
helices.append([
f"{helix[19]}.{helix[21:25].strip()}{helix[25].strip()}",
f"{helix[31]}.{helix[33:37].strip()}{helix[37].strip() if len(helix) > 37 else ''}",
])
for strand in pdb_dict.get("SHEET", []):
strands.append([
f"{strand[21]}.{strand[22:26].strip()}{strand[26].strip()}",
f"{strand[32]}.{strand[33:37].strip()}{strand[37].strip() if len(strand) > 37 else ''}",
])
return {"helices": helices, "strands": strands}
[docs]def get_full_names(pdb_dict):
"""Creates a mapping of het names to full English names.
:param pdb_dict: the .pdb dict to read.
:rtype: ``dict``"""
full_names = {}
for line in pdb_dict.get("HETNAM", []):
try:
full_names[line[11:14].strip()] += line[15:].strip()
except: full_names[line[11:14].strip()] = line[15:].strip()
return full_names
[docs]def make_aniso(model_lines):
"""Creates a mapping of chain IDs to anisotropy, by parsing ANISOU records.
:param dict pdb_dict: the .pdb dictionary to read.
:rtype: ``dict``"""
return {int(line[6:11].strip()): [
int(line[n * 7 + 28:n * 7 + 35]) / 10000 for n in range(6)
] for line in model_lines if line[:6] == "ANISOU"}
[docs]def get_last_ter_line(model_lines):
"""Gets the index of the last TER record in a list of records. 0 will be
returned if there are none.
:param list model_lines: the lines to search.
:rtype: ``int``"""
last_ter = 0
for index, line in enumerate(model_lines[::-1]):
if line[:3] == "TER":
last_ter = len(model_lines) - index - 1
break
return last_ter
[docs]def id_from_line(line):
"""Creates a residue ID from an atom line.
:param str line: the ATOM or HETATM line record.
:rtype: ``str``"""
return "{}.{}{}".format(line[21], line[22:26].strip(), line[26].strip())
[docs]def add_atom_to_polymer(line, model, chain_id, res_id, aniso_dict, full_names):
"""Takes an .pdb ATOM or HETATM record, converts it, and adds it to a
polymer dictionary.
:param dict line: the line to read.
:param dict model: the model to update.
:param str chain_id: the chain ID to add to.
:param str res_id: the molecule ID to add to.
:param dict aniso_dict: lookup dictionary for anisotropy information."""
try:
model["polymer"][chain_id]["residues"][res_id]["atoms"][
int(line[6:11])
] = atom_line_to_dict(line, aniso_dict)
except:
name = line[17:20].strip()
try:
model["polymer"][chain_id]["residues"][res_id] = {
"name": name, "full_name": full_names.get(name),
"atoms": {int(line[6:11]): atom_line_to_dict(line, aniso_dict)},
"number": len(model["polymer"][chain_id]["residues"]) + 1
}
except:
model["polymer"][chain_id] = {
"internal_id": chain_id, "helices": [], "strands": [],
"residues": {res_id: {
"name": line[17:20].strip(),
"atoms": {int(line[6:11]): atom_line_to_dict(line, aniso_dict)},
"number": 1, "full_name": None,
}}
}
[docs]def add_atom_to_non_polymer(line, model, res_id, aniso_dict, full_names):
"""Takes an .pdb ATOM or HETATM record, converts it, and adds it to a
non-polymer dictionary.
:param dict line: the line to read.
:param dict model: the model to update.
:param str res_id: the molecule ID to add to.
:param dict aniso_dict: lookup dictionary for anisotropy information."""
key = "water" if line[17:20] in ["HOH", "DOD"] else "non-polymer"
try:
model[key][res_id]["atoms"][
int(line[6:11])
] = atom_line_to_dict(line, aniso_dict)
except:
name = line[17:20].strip()
model[key][res_id] = {
"name": name, "full_name": full_names.get(name),
"internal_id": line[21], "polymer": line[21],
"atoms": {int(line[6:11]): atom_line_to_dict(line, aniso_dict)}
}
[docs]def atom_line_to_dict(line, aniso_dict):
"""Converts an ATOM or HETATM record to an atom dictionary.
:param str line: the record to convert.
:param dict aniso_dict: the anisotropy dictionary to use.
:rtype: ``dict``"""
a = {
"occupancy": 1, "bvalue": None, "charge": 0,
"anisotropy": aniso_dict.get(int(line[6:11].strip()), [0, 0, 0, 0, 0, 0])
}
a["is_hetatm"] = line[:6] == "HETATM"
a["name"] = line[12:16].strip() or None
a["alt_loc"] = line[16].strip() or None
a["x"] = float(line[30:38].strip())
a["y"] = float(line[38:46].strip())
a["z"] = float(line[46:54].strip())
if line[54:60].strip(): a["occupancy"] = float(line[54:60].strip())
if line[60:66].strip(): a["bvalue"] = float(line[60:66].strip())
a["element"] = line[76:78].strip() or None
if line[78:80].strip():
try:
a["charge"] = int(line[78:80].strip())
except: a["charge"] = int(line[78:80][::-1].strip())
return a
[docs]def merge_lines(lines, start, join=" "):
"""Gets a single continuous string from a sequence of lines.
:param list lines: The lines to merge.
:param int start: The start point in each record.
:param str join: The string to join on.
:rtype: ``str``"""
string = join.join([line[start:].strip() for line in lines])
return string
[docs]def structure_to_pdb_string(structure):
"""Converts a :py:class:`.AtomStructure` to a .pdb filestring.
:param AtomStructure structure: the structure to convert.
:rtype: ``str``"""
lines = []
pack_sequences(structure, lines)
atoms = sorted(structure.atoms(), key=lambda a: a.id)
for i, atom in enumerate(atoms):
atom_to_atom_line(atom, lines)
if isinstance(atom.het, Residue) and (
atom is atoms[-1] or atoms[i + 1].chain is not atom.chain or
isinstance(atoms[i + 1].het, Ligand)):
last = lines[-1]
lines.append(f"TER {last[6:11]} {last[17:20]} {last[21]}{last[22:26]}{last[26]}")
return "\n".join(lines)
[docs]def pack_sequences(structure, lines):
"""Adds SEQRES lines from polymer sequence data.
:param AtomStructure structure: the structure to convert.
:param list lines: the string lines to update."""
try:
for chain in sorted(structure.chains(), key=lambda c: c.id):
residues = valerius.from_string(chain.sequence).codes
length = len(residues)
line_count = ceil(length / 13)
for line_num in range(line_count):
lines += ["SEQRES {:>3} {} {:>4} {}".format(
line_num + 1, chain.id, length,
" ".join(residues[line_num * 13: (line_num + 1) * 13])
)]
except AttributeError: pass
[docs]def atom_to_atom_line(a, lines):
"""Converts an :py:class:`.Atom` to an ATOM or HETATM record. ANISOU lines
will also be added where appropriate.
:param Atom a: The Atom to pack.
:param list lines: the string lines to update."""
line = "{:6}{:5} {:4} {:3} {:1}{:4}{:1} "
line += "{:>8}{:>8}{:>8} 1.00{:6} {:>2}{:2}"
id_, residue_name, chain_id, residue_id, insert_code = "", "", "", "", ""
if a.het:
id_, residue_name = a.het.id, a.het._name
chain_id = a.chain.id if a.chain is not None else ""
residue_id = int("".join([c for c in id_ if c.isdigit() or c == "-"]))
insert_code = id_[-1] if id_ and id_[-1].isalpha() else ""
atom_name = a._name or ""
atom_name = " " + atom_name if len(atom_name) < 4 else atom_name
occupancy = " 1.00"
line = line.format(
"HETATM" if isinstance(a.het, Ligand) or a._is_hetatm else "ATOM",
a.id, atom_name, residue_name, chain_id, residue_id, insert_code,
"{:.3f}".format(a.location[0]) if a.location[0] is not None else "",
"{:.3f}".format(a.location[1]) if a.location[1] is not None else "",
"{:.3f}".format(a.location[2]) if a.location[2] is not None else "",
"{:.2f}".format(a.bvalue).strip().rjust(6) if a.bvalue is not None else "",
a.element or "", str(int(a.charge))[::-1] if a.charge else "",
)
lines.append(line)
if a.anisotropy != [0, 0, 0, 0, 0, 0]:
lines.append(atom_to_anisou_line(a, atom_name,
residue_name, chain_id, residue_id, insert_code))
[docs]def atom_to_anisou_line(a, name, res_name, chain_id, res_id, insert):
"""Converts an :py:class:`.Atom` to an ANISOU record.
:param Atom a: The Atom to pack.
:param str name: The atom name to use.
:param str res_name: The residue name to use.
:param str chain_id: The chain ID to use.
:param str res_id: The residue ID to use.
:param str insert: The residue insert code to use.
:rtype: ``str``"""
line = "ANISOU{:5} {:4} {:3} {:1}{:4}{:1} "
line += "{:>7}{:>7}{:>7}{:>7}{:>7}{:>7} {:>2}{:2}"
anisotropy = [round(x * 10000 )for x in a.anisotropy]
line = line.format(
a.id, name, res_name, chain_id, res_id, insert,
anisotropy[0], anisotropy[1], anisotropy[2],
anisotropy[3], anisotropy[4], anisotropy[5],
a.element if a.element else "",
str(int(a.charge))[::-1] if a.charge else "",
)
return line