import random, string
from rdkit import Chem
import numpy as np
import pandas as pd
import glob, os
from openbabel import openbabel
[docs]
def get_bonded_atoms(xyz_string, atom_index, search_this_atomic_num=None):
"""Use openbabel's methods to find the coordinates of all atoms that are bonded to a given atom
:param XYZ file in string format:
:param atom_index:
:return: numpy array of atoms bonded to a given atom
"""
# initalize openbabel classes
obconversion = openbabel.OBConversion()
obconversion.SetInFormat("xyz")
mol = openbabel.OBMol()
obconversion.ReadString(mol, xyz_string)
# make atom object for atom we want
atom = mol.GetAtom(atom_index + 1) # for obmol get functions indexing starts from 1
index_list = []
for neighbour_atom in openbabel.OBAtomAtomIter(atom):
if search_this_atomic_num is not None:
if neighbour_atom.GetAtomicNum() != search_this_atomic_num:
continue
atomic_num = neighbour_atom.GetAtomicNum()
index = neighbour_atom.GetIdx()
index_list.append([atomic_num, index])
return index_list
[docs]
def calculate_dihedral(coordinates_1, coordinates_2, coordinates_3, coordinates_4):
"""Praxeolitic formula
1 sqrt, 1 cross product source: https://stackoverflow.com/a/34245697"""
p0 = coordinates_1
p1 = coordinates_2
p2 = coordinates_3
p3 = coordinates_4
b0 = -1.0 * (p1 - p0)
b1 = p2 - p1
b2 = p3 - p2
# normalize b1 so that it does not influence magnitude of vector
# rejections that come next
b1 /= np.linalg.norm(b1)
# vector rejections
# v = projection of b0 onto plane perpendicular to b1
# = b0 minus component that aligns with b1
# w = projection of b2 onto plane perpendicular to b1
# = b2 minus component that aligns with b1
v = b0 - np.dot(b0, b1) * b1
w = b2 - np.dot(b2, b1) * b1
# angle between v and w in a plane is the torsion angle
# v and w may not be normalized but that's fine since tan is y/x
x = np.dot(v, w)
y = np.dot(np.cross(b1, v), w)
return np.degrees(np.arctan2(y, x))
[docs]
def calculate_distance(a, b):
"""
This function calculates the distance between two points in 3D space.
Args:
a : array of floats
b : array of floats
Return:
distance : float
"""
distance = np.sqrt((a[0]-b[0])**2 + (a[1]-b[1])**2 + (a[2]-b[2])**2)
return distance
[docs]
def add_code_to_structure():
N = 10
code_name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N))
return code_name
[docs]
def change_second_line_xyz(xyz, new_content=''):
with open('{}'.format(xyz), 'r', newline='\n') as file:
# read a list of lines into data
data = file.readlines()
# now change the 2nd line, note that you have to add a newline
if new_content == '':
data[1] = '\n'
else:
data[1] = '{}\n'.format(new_content)
# and write everything back
with open('{}'.format(xyz), 'w') as file:
file.writelines(data)
[docs]
def get_ligands_from_smiles(ligands_SMILES):
ligand_mol = ['']*len(ligands_SMILES)
for index, ligand in enumerate(ligands_SMILES):
ligands_SMILES[index] = Chem.MolFromSmiles(ligand)
return ligand_mol
[docs]
def dataframe_from_dictionary(dictionary):
Dataframe = pd.DataFrame.from_dict({i: dictionary[i]
for i in dictionary.keys()}, orient='index')
return Dataframe
[docs]
def get_cluster_centroid_coord(n_clusters, unique_labels, dataframe):
"""
This function identifies the centroids of the clusters generated by the UMAP
algorithm.
Args:
n_clusters : integer
unique_labels : array of integers
dataframe : pandas dataframe containing UMAP1, UMAP2
Return:
centroids : array of float sized (2, n_clusters),
where 2 is (x, y) coordinates
"""
# initialize cluster centroids array
centroids = np.zeros((2, n_clusters))
# fill the array with the coords
for j, u_label in enumerate(unique_labels):
UMAP1 = dataframe[(dataframe['label'] == u_label)].mean()['UMAP-1']
UMAP2 = dataframe[(dataframe['label'] == u_label)].mean()['UMAP-2']
temp_coord_array = np.array([UMAP1, UMAP2])
centroids[1][j] = temp_coord_array[0]
centroids[0][j] = temp_coord_array[1]
return centroids
[docs]
def find_bonds_with_neighbours(mol, center_atom_atomic_number):
center_atom = [atom for atom in mol.GetAtoms() if atom.GetAtomicNum() == center_atom_atomic_number][0]
bonds = [bond for bond in center_atom.GetBonds()]
return bonds
[docs]
def check_if_at_least_two_mapped_atoms_in_ring(list_of_mapped_idxs, list_of_ring_idxs):
return len(set(list_of_mapped_idxs) & set(list_of_ring_idxs)) >= 2
[docs]
def find_bidentate(xyz):
with open(xyz) as file:
listfile = []
for line in file:
listfile.append(line.strip())
indices = [0]*3
atoms = []
for index, atom in enumerate(listfile):
if atom[0] == 'P':
if not indices[0]:
indices[0] = index - 1
else:
indices[2] = index - 1
atoms.append(atom[0])
else:
if atom[:2] == 'Ir':
indices[1] = index - 1
atoms.append(atom[:2])
return indices
[docs]
def xyz_to_gjf(header, io_path):
if not os.path.exists(io_path):
os.mkdir(io_path)
os.chdir(io_path)
xyzs = glob.glob('*.xyz')
for xyz in xyzs:
with open(xyz) as file:
f = file.readlines()
print(len(f))
for new_line in reversed(header):
f.insert(0, new_line)
f.pop(len(header))
f.pop(len(header))
if f[-1] != '\n':
f.append('\n')
file.close()
with open(f'{xyz[:-4]}.gjf', 'w+') as file:
file.writelines(f)
file.close()
[docs]
def gjf_to_xyz(path, header):
for gjf in glob.glob(os.path.join(path, "*.gjf")):
with open(gjf) as file:
f = file.readlines()
# count = len(f) - len(header)
# print(count, gjf)
f = f[len(header):]
# print(f)
count = len(f) - 2
f.insert(0, '\n')
f.insert(0, str(count))
file.close()
with open(f'{gjf[:-4]}.xyz', 'w+') as file:
file.writelines(f)
file.close()
if __name__ == "__main__":
# xyz_to_gjf(header=header,
# io_path='data/Rh_dataset/ferrocenes/other')
# gjf_to_xyz(path='data/Rh_dataset/ferrocenes/gaussian_input_files/gjf', header=header)
xyz_to_gjf(header=header, io_path='../')