% LaTeX support: latex@mdpi.com 
% In case you need support, please attach all files that are necessary for compiling as well as the log file, and specify the details of your LaTeX setup (which operating system and LaTeX version / tools you are using).

%=================================================================
\documentclass[preprints,article,accept,moreauthors,pdftex]{Definitions/mdpi} 
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage[affil-it]{authblk}
\usepackage{caption}
\usepackage{color}
\usepackage{dcolumn}
\usepackage{bm}
\usepackage{float}
\usepackage{hyperref}
\usepackage{longtable}
\usepackage{gensymb}
\usepackage{microtype}
\usepackage{lineno,hyperref}
\usepackage{chngcntr}
\usepackage[english]{babel}
\usepackage{graphicx}
\usepackage{CJK}
\usepackage{units}
\usepackage{longtable}
\usepackage{amsmath}
\usepackage{textcomp}
\usepackage{upgreek}
\usepackage{textgreek}
\usepackage{multirow}
\usepackage{color}
\usepackage{bm}
\usepackage{colortbl}
\usepackage{subscript}
\usepackage{comment}
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage[affil-it]{authblk}
\usepackage{caption}
\usepackage{color}
\usepackage{dcolumn}
\usepackage{bm}
\usepackage{float}
\usepackage{hyperref}
\usepackage{longtable}
\usepackage{gensymb}
\usepackage{microtype}
\usepackage{lineno,hyperref}
\usepackage{chngcntr}
\usepackage[english]{babel}
\usepackage{graphicx}
\usepackage{CJK}
\usepackage{chngcntr}
\usepackage[english]{babel}
\usepackage{graphicx}
\usepackage{CJK}
\usepackage{textcomp}
\usepackage{upgreek}
\usepackage{textgreek}
\usepackage{multirow}
\usepackage{color}
\usepackage{colortbl}
\usepackage{subscript}
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{seqsplit}
\usepackage{upgreek} 
\usepackage{geometry} 
\usepackage{caption}
\usepackage{color}
\usepackage{dcolumn}
\usepackage{bm}
\usepackage{float}
\usepackage{seqsplit}
\usepackage{hyperref}
\usepackage{longtable}
\usepackage{gensymb}
\usepackage{microtype}
\usepackage{lineno,hyperref}
\usepackage{chngcntr}
\usepackage[english]{babel}
\usepackage{graphicx}
\usepackage{units}
\usepackage{longtable}
\usepackage{amsmath}
\usepackage{textcomp}
\usepackage{upgreek}
\usepackage{textgreek}
\usepackage{multirow}
\usepackage{color}
\usepackage{bm}
\usepackage{colortbl}
\usepackage{subscript}
\usepackage[affil-it]{authblk}
\usepackage{caption}
\usepackage{color}
\usepackage{dcolumn}
\usepackage{bm}
\usepackage{float}
\usepackage{hyperref}
\usepackage{longtable}
\usepackage{gensymb}
\usepackage{url}
\usepackage{microtype}
\usepackage{lineno,hyperref}
\usepackage{chngcntr}
\usepackage[english]{babel}
\usepackage{graphicx}
\usepackage{CJK}
\usepackage{units}
\usepackage{longtable}
\usepackage{amsmath}
\usepackage{textcomp}
\usepackage{upgreek}
\usepackage{textgreek}
\usepackage{multirow}
\usepackage{colortbl}
\usepackage{subscript}
\usepackage{comment}
\usepackage{color}
\usepackage{dcolumn}
\usepackage{bm}
\usepackage{float}
\usepackage{seqsplit}
\usepackage{hyperref}
\usepackage{longtable}
\usepackage{gensymb}
\usepackage{microtype}
\usepackage{lineno,hyperref}
\usepackage{chngcntr}
\usepackage[english]{babel}
\usepackage{graphicx}
\usepackage{units}
\usepackage{longtable}
\usepackage{amsmath}
\usepackage{textcomp}
\usepackage{upgreek}
\usepackage{textgreek}
\usepackage{multirow}
\usepackage{color}
\usepackage{bm}
\usepackage{colortbl}
\usepackage{subscript}

\usepackage{units}
\usepackage{longtable}
\usepackage{amsmath}
\usepackage{textcomp}
\usepackage{upgreek}
\usepackage{textgreek}
\usepackage{multirow}
\usepackage{color}
\usepackage{bm}
\usepackage{colortbl}
\usepackage{subscript}
%\usepackage{enumerate}
\newcommand{\angstrom}{\AA}
\newcommand{\pka}{pK{\small a}}
\newcommand{\pKa}{pK{\small a}}
\newcommand{\hiso}{\textsuperscript{1}H}
\newcommand{\htwo}{\textsuperscript{2}H}
\newcommand{\niso}{\textsuperscript{15}N}
\newcommand{\ciso}{\textsuperscript{13}C}
\newcommand{\piso}{\textsuperscript{31}P}
\newcommand{\calcium}{Ca\textsuperscript{2+}}
\newcommand{\Calcium}{Ca\textsuperscript{2+}}
\newcommand{\sodium}{Na\textsuperscript{+}}
\newcommand{\potasium}{K\textsuperscript{+}}
\newcommand{\chloride}{Cl\textsuperscript{-}}
\newcommand{\hydrogen}{H\textsuperscript{+}}
\newcommand{\CaV}{Ca\textsubscript{V}1.2}
\newcommand{\CaVone}{Ca\textsubscript{V}1.1}
\newcommand{\micro}{µ}
\newcommand{\schro}{Schr{\"o}dinger}

\definecolor{gray}{rgb}{0.66, 0.66, 0.66}
\hypersetup{colorlinks = true, citecolor = blue, urlcolor = blue}
\definecolor{background-color}{gray}{0.98}


% If you would like to post an early version of this manuscript as a preprint, you may use preprint as the journal and change 'submit' to 'accept'. The document class line would be, e.g., \documentclass[preprints,article,accept,moreauthors,pdftex]{mdpi}. This is especially recommended for submission to arXiv, where line numbers should be removed before posting. For preprints.org, the editorial staff will make this change immediately prior to posting.

%--------------------
% Class Options:
%--------------------
%----------
% journal
%----------
% Choose between the following MDPI journals:
% acoustics, actuators, addictions, admsci, aerospace, agriculture, agriengineering, agronomy, algorithms, animals, antibiotics, antibodies, antioxidants, applsci, arts, asc, asi, atmosphere, atoms, axioms, batteries, bdcc, behavsci , beverages, bioengineering, biology, biomedicines, biomimetics, biomolecules, biosensors, brainsci , buildings, cancers, carbon , catalysts, cells, ceramics, challenges, chemengineering, chemistry, chemosensors, children, cleantechnol, climate, clockssleep, cmd, coatings, colloids, computation, computers, condensedmatter, cosmetics, cryptography, crystals, dairy, data, dentistry, designs , diagnostics, diseases, diversity, drones, econometrics, economies, education, ejihpe, electrochem, electronics, energies, entropy, environments, epigenomes, est, fermentation, fibers, fire, fishes, fluids, foods, forecasting, forests, fractalfract, futureinternet, futurephys, galaxies, games, gastrointestdisord, gels, genealogy, genes, geohazards, geosciences, geriatrics, hazardousmatters, healthcare, heritage, highthroughput, horticulturae, humanities, hydrology, ijerph, ijfs, ijgi, ijms, ijns, ijtpp, informatics, information, infrastructures, inorganics, insects, instruments, inventions, iot, j, jcdd, jcm, jcp, jcs, jdb, jfb, jfmk, jimaging, jintelligence, jlpea, jmmp, jmse, jnt, jof, joitmc, jpm, jrfm, jsan, land, languages, laws, life, literature, logistics, lubricants, machines, magnetochemistry, make, marinedrugs, materials, mathematics, mca, medicina, medicines, medsci, membranes, metabolites, metals, microarrays, micromachines, microorganisms, minerals, modelling, molbank, molecules, mps, mti, nanomaterials, ncrna, neuroglia, nitrogen, notspecified, nutrients, ohbm, optics, particles, pathogens, pharmaceuticals, pharmaceutics, pharmacy, philosophies, photonics, physics, plants, plasma, polymers, polysaccharides, preprints , proceedings, processes, proteomes, psych, publications, quantumrep, quaternary, qubs, reactions, recycling, religions, 
remotesensing, reports, resources, risks, robotics, safety, sci, scipharm, sensors, separations, sexes, signals, sinusitis, smartcities, sna, societies, socsci, soilsystems, sports, standards, stats, surfaces, surgeries, sustainability, symmetry, systems, technologies, test, toxics, toxins, tropicalmed, universe, urbansci, vaccines, vehicles, vetsci, vibration, viruses, vision, water, wem, wevj

%---------
% article
%---------
% The default type of manuscript is "article", but can be replaced by: 
% abstract, addendum, article, benchmark, book, bookreview, briefreport, casereport, changes, comment, commentary, communication, conceptpaper, conferenceproceedings, correction, conferencereport, expressionofconcern, extendedabstract, meetingreport, creative, datadescriptor, discussion, editorial, essay, erratum, hypothesis, interestingimages, letter, meetingreport, newbookreceived, obituary, opinion, projectreport, reply, retraction, review, perspective, protocol, shortnote, supfile, technicalnote, viewpoint
% supfile = supplementary materials

%----------
% submit
%----------
% The class option "submit" will be changed to "accept" by the Editorial Office when the paper is accepted. This will only make changes to the frontpage (e.g., the logo of the journal will get visible), the headings, and the copyright information. Also, line numbering will be removed. Journal info and pagination for accepted papers will also be assigned by the Editorial Office.

%------------------
% moreauthors
%------------------
% If there is only one author the class option oneauthor should be used. Otherwise use the class option moreauthors.

%---------
% pdftex
%---------
% The option pdftex is for use with pdfLaTeX. If eps figures are used, remove the option pdftex and use LaTeX and dvi2pdf.

%=================================================================
\firstpage{1} 
\makeatletter 
\setcounter{page}{\@firstpage} 
\makeatother
\pubvolume{xx}
\issuenum{1}
\articlenumber{5}
\pubyear{2023}
\copyrightyear{2023}
%\externaleditor{Academic Editor: name}
\history{Received: date; Accepted: date; Published: date}
%\updates{yes} % If there is an update available, un-comment this line

%% MDPI internal command: uncomment if new journal that already uses continuous page numbers 
%\continuouspages{yes}

%------------------------------------------------------------------
% The following line should be uncommented if the LaTeX file is uploaded to arXiv.org
%\pdfoutput=1

%=================================================================
% Add packages and commands here. The following packages are loaded in our class file: fontenc, calc, indentfirst, fancyhdr, graphicx, lastpage, ifthen, lineno, float, amsmath, setspace, enumitem, mathpazo, booktabs, titlesec, etoolbox, amsthm, hyphenat, natbib, hyperref, footmisc, geometry, caption, url, mdframed, tabto, soul, multirow, microtype, tikz

%=================================================================
%% Please use the following mathematics environments: Theorem, Lemma, Corollary, Proposition, Characterization, Property, Problem, Example, ExamplesandDefinitions, Hypothesis, Remark, Definition, Notation, Assumption
%% For proofs, please use the proof environment (the amsthm package is loaded by the MDPI class).

%=================================================================
% Full title of the paper (Capitalized)
%\Title{}
\Title{Towards a truly general intermolecular binding affinity calculator for drug discovery \& design}
% Author Orchid ID: enter ID or remove command
\newcommand{\orcidauthorA}{0000-0001-6060-7937} % Add \orcidA{} behind the author's name
%\newcommand{\orcidauthorB}{0000-0000-000-000X} % Add \orcidB{} behind the author's name

% Authors, for the paper (add full first names)
\Author{Wei Li\orcidA{}*$^{1}$, Gary G. Vottevor $^{1}$}

% Authors, for metadata in PDF
\AuthorNames{Wei Li and Gary G. Vottevor}

% Affiliations / Addresses (Add [1] after \address if there is only one affiliation.)
\address{%
$^{1}$ \quad Contrebola Institute of Computational Interstructural Biophysics, 
No. 88, Fuxing East Road, Nantong City 226000, 
Jiangsu Province, P. R. China}

% Contact information of the corresponding author
\corres{Correspondence: wli148@aucklanduni.ac.nz}

% Current address and/or shared authorship
%\firstnote{Current address: Affiliation 3} 
%\secondnote{These authors contributed equally to this work.}
% The commands \thirdnote{} till \eighthnote{} are available for further notes

%\simplesumm{} % Simple summary

%\conference{} % An extended version of a conference paper

% Abstract (Do not insert blank lines, i.e. \\) 
\abstract{
Intermolecular interactions are the fabric underlying almost all processes in living organisms, where two cornerstone concepts, intermolecular binding affinity (K\textsubscript{d}) and binding energy (\textDelta G), have long been established to physically describe the strengths of biomolecular interactions, e.g., drug-target K\textsubscript{d} and \textDelta G to describe the strength of drug-target interaction.
The past two to three years saw a big step forward in the use of artificial intelligence (AI) in structural biology (e.g., AlphaFold for protein structure prediction) and drug discovery \& design.
In light of the roles of K\textsubscript{d} and \textDelta G in drug discovery \& design, the speed of this AI progress raises a question of what's next for its practical application in the pharmaceutical industry, in addition to a system-wide account of biomolecular structures and motions.
In August 2022, the concept of a general intermolecular binding affinity calculator (GIBAC) was coined and proposed for the first time in an MDPI-published preprint.
Here, this article puts forward an updated conceptual and practical framework of GIBAC, including its inception, definition, construction, practical applications, technical challenges and limitations, and future directions.
Moreover, this article argues that the time is now ripe for the construction of such an accurate, precise and efficient GIBAC to be on the agenda of the entire drug discovery \& design community, to ensure its applicability \& reliability, and to enhance its value in drug R\&D in the future.
}

% Keywords
\keyword{GIBAC; Biophysics; Structural Biology; Drug discovery \& design; Artificial intelligence-integrated drug discovery}
\clearpage
\clearpage
% The fields PACS, MSC, and JEL may be left empty or commented out if not applicable
%\PACS{J0101}
%\MSC{}
%\JEL{}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Only for the journal Diversity
%\LSID{\url{http://}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Only for the journal Applied Sciences:
%\featuredapplication{Authors are encouraged to provide a concise description of the specific application or a potential application of the work. This section is not mandatory.}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Only for the journal Data:
%\dataset{DOI number or link to the deposited data set in cases where the data set is published or set to be published separately. If the data set is submitted and will be published as a supplement to this paper in the journal Data, this field will be filled by the editors of the journal. In this case, please make sure to submit the data set as a supplement when entering your manuscript into our manuscript editorial system.}

%\datasetlicense{license under which the data set is made available (CC0, CC-BY, CC-BY-SA, CC-BY-NC, etc.)}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Only for the journal Toxins
%\keycontribution{The breakthroughs or highlights of the manuscript. Authors can write one or two sentences to describe the most important part of the paper.}

%\setcounter{secnumdepth}{4}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\clearpage
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section*{Introduction}
On August 11, 2022, the concept of a general intermolecular binding affinity calculator (GIBAC) was coined and proposed for the first time in an MDPI-published preprint (\url{https://www.preprints.org/manuscript/202208.0213/v1}) \cite{Li2022GIBAC}, of which this article puts forward an update, including GIBAC's inception, definition, construction, practical applications, technical challenges and limitations, and future directions.
In addition, this article discusses how we can combine the insights of structural biology, biophysics and artificial intelligence to build an accurate, precise and efficient GIBAC, and to navigate the next frontier for its practical application in drug discovery \& design.




\section*{Inception of GIBAC: the origin of the idea of it}
\subsection*{Intermolecular binding affinity is key to how genetic variation affects drug discovery \& design}
Genetic variation is a fundamental aspect of biology \cite{Cheng2023missense,Ellegren2016,Lynch2016}, which occurs not just in infectious diseases \cite{Wong2023science,Murray2022AMR}, but also in non-communicable diseases, e.g., cancer \cite{FonsecaMontao2022}, cardiovascular \cite{Blaich2012,Seisenberger2000} and rare diseases \cite{Hua2011,Hua2015,Li2017SMA}.
Take SARS-CoV-2 for instance \cite{Hu2020ZhengLiShi,Li2020covid19}.
The emergence of SARS-CoV-2 variants has had implications for the discovery and design of drugs (small molecules and antibodies) and vaccines. 
As revealed by genomic surveillance, SARS-CoV-2 has undergone genetic mutations that alter its spike (S) protein, the primary target for discovery and design of neutralizing antibodies and vaccines against SARS-CoV-2 \cite{Dong_2020,Lippi2020}.
Meanwhile, these mutations could lead to changes in the virus's antigenic properties (Figure~\ref{fig:2}), conferring resistance to specific monoclonal antibodies, and reducing the effectiveness of existing antibodies and vaccines \cite{Chen2022variants,Li2020abdesign}. 
In addition to antibodies and vaccines, small molecules inhibiting spike-ACE2 protein-protein interactions (PPI) have also been the focus of a variety of R\&D efforts to develop potential blockers of viral attachment and entry for SARS-CoV-2 into host cells \cite{Bojadzic2021,Chuang2022}.
In this respect, SARS-CoV-2 variants carry genetic changes in critical viral proteins (e.g., the main protease) \cite{Singh2021,Liu2007Spike,Tao2021}, which alter their binding affinity to small molecule inhibitors.

In infectious diseases like COVID-19, therefore, genetic changes of the pathogen(s) lead to structural and functional consequences \cite{Li2018booksma,me_rTPA_JCBC,Li_2020furin,Li_2021Semaglutide}, which in turn affect the efficacy of drugs and vaccines against infectious agents such as SARS-CoV-2 \cite{Shu2017GISAID}, calling for an intermolecular K\textsubscript{d} calculator to be able to accommodate genetic variations, as described in Equation~\ref{eqn1}, where \underline{$molA$}, \underline{$molB$}, and \underline{$molC$} represent S protein, ACE2 on the host cell's membrane, and antibody or small molecule inhibitors against SARS-CoV-2, respectively, while \underline{$mutation$} represents genetic mutation(s) of SARS-CoV-2's S protein, and \underline{$envPara$} represents K\textsubscript{d}-related environmental parameters such as temperature and pH \cite{Li2022GIBAC}. 

\begin{equation} 
\label{eqn1}
K\textsubscript{d} = f(molA, molB, molC, envPara, mutation(s)) 
\end{equation}

Of further interest, an intermolecular K\textsubscript{d} calculator as described in Equation~\ref{eqn1} can strengthen our structural and functional understanding of those genetic variations, which can in turn aid the design of multi-site-targeting therapeutics with enhanced binding affinities, towards the discovery and development of broad-spectrum inhibitors against variants of infectious pathogens (e.g., SARS-CoV-2) in the future \cite{Abdolmaleki2018,Burki_2023,Brito_2022,Giovanetti2021}.



\begin{figure*}[h]
\centering
\includegraphics[width=0.6\textwidth]{linearflowchart.png}
\caption{Genetic variation-initiated central dogma-like flowchart mechanism through which genetic variation exerts its impact on drug discovery \& design, in case $\texttt{protein function} = \texttt{drug target}$ here.}
\label{fig:2}
\end{figure*}




Similarly, in non-communicable diseases, genetic variations of drug target protein structure(s) are also linked to the discovery \& design of therapeutics, including antibodies, small molecule inhibitors and vaccines.
For instance, the emergence of resistance to antibody therapies targeting proteins such as EGFR, PD-1 or PD-L1 \cite{Li2018PD1} has been a significant challenge in cancer treatment, as mutations in tumor cells impair the binding of antibodies (or small molecule inhibitors) to these proteins (Figure~\ref{fig:2}), leading to the development of resistance to therapeutics of a variety of cancers, thereby diminishing their therapeutic efficacy \cite{Sharma2017,Banerjee2017}.
Thus, understanding the mechanisms underlying mutation-induced resistance is crucial for the development of effective strategies to overcome this challenge in cancer immunotherapy \cite{Yun2007emergence,Gettinger2017Cancer,Zaretsky2016}, calling for an intermolecular K\textsubscript{d} calculator (as described in Equation~\ref{eqn1}) to be able to accommodate genetic variations \cite{Li2022GIBAC}, where \underline{$molA$}, \underline{$molB$}, and \underline{$molC$} represent PD-1, PD-L1 and antibody or small molecule inhibitor against cancer cell's immune escape \cite{Li2020catmouse,Beatty2014}, respectively, \underline{$mutation$} represents genetic mutation(s) of PD-1 and/or PD-L1, and \underline{$envPara$} represents K\textsubscript{d}-related environmental parameters, e.g., temperature and pH \cite{Li2022GIBAC}. 

Taken together, therefore, it is necessary to build a \underline{general} intermolecular binding affinity calculator (GIBAC \cite{Li2022GIBAC}), which is able to accommodate genetic variation(s), towards efficient \underline{design-make-test} cycles of therapeutic candidates for the treatments of both infectious and non-communicable diseases \cite{Wong2018MIT}.



\subsection*{Towards a truly general intermolecular K\textsubscript{d} calculator}
Intermolecular binding affinity, e.g., drug-target binding affinity (K\textsubscript{d}), is inextricably linked to the mechanism through which genetic variation exerts its impact (Figure~\ref{fig:2}) on drug discovery \& design.
Yet, the degree to which genetic variations occur is much lower than that of the complexity of (bio)molecules, which includes a variety of factors such as alternative splicing \cite{Hua2011,Hua2015}, post-translational modifications (PTMs), post-expression modifications (PEMs), chemical and biological space \cite{Lipinski2004nature,Roggia2023,RecursionPharmaceuticals}, three-dimensional structure \cite{History1971,Berman2003Announcing,Li2021PDB}, structural folding and conformational dynamics \cite{Wang2023conformationalspace,Shulman1970}, intermolecular interactions, induced-fit binding \cite{Csermely2010,Teague_2003}, conformational transition and dynamics of intrinsically disordered proteins \cite{Ruan2021IDP,Wang2023IDP,Liu2023IDP}.
To achieve accurate calculation of intermolecular K\textsubscript{d}, biomolecular structural information of the interacting partners is indispensable but not always available for physics-based intermolecular K\textsubscript{d} calculators such as Prodigy and BindProfX \cite{Vangone2015,Xue2016,Xiong_2017}.


To date, the available approaches for the calculation of intermolecular K\textsubscript{d} fall into three categories, i.e., physics-, statistics- and artificial intelligence (AI)-based approaches, each of which has its own advantages and disadvantages.
For instance, physics-based models \cite{Huang_2006mechanics,Cavasotto_2020} are only as accurate as their physical representation of the underlying principles governing intermolecular interactions, while the complexity of the physical equations makes the calculations computationally expensive and time-consuming.
Statistics-based models can capture complex relationships between molecular features and binding affinities, and do not require detailed knowledge of the underlying physics. Yet, their performance is highly dependent on the quality and the quantity of available data, with limited accuracy in case statistical approaches do not capture the full complexity of the intermolecular interactions.
Lastly, AI-based approaches \cite{Soni_2020,Ballester_2010} can learn complex patterns and correlations from large datasets, and capture non-linear relationships between molecular features and binding affinities. 
However, AI approaches require extensive training data, and the accuracy of AI models is also highly dependent on the quality and quantity of the training data \cite{Berman2003Announcing,Li2020EUT,Evans_2021}.
To ensure accuracy and precision of intermolecular K\textsubscript{d} calculation, therefore, a combined hybrid approach of AI and physics is herein described and discussed below in the section of \underline{\textbf{Construction of GIBAC}}.

Taken together, none of the three approaches (as of \today) is able to collectively meet the standards (listed below) of a GIBAC with adequate accuracy, precision, efficiency and practical applications in drug discovery \& design.
This article therefore puts forward a conceptual and practical framework to build such a GIBAC, and to ensure its applicability, reliability and efficiency, and to enhance its value in the pharmaceutical industry \cite{Wong2018MIT}.



\begin{enumerate}
\item A truly general intermolecular K\textsubscript{d} calculator needs to take into account genetic variations (as described in Equation~\ref{eqn1});
\item Structural information of the interacting partners is indispensable but not always available for physics-based calculation of intermolecular K\textsubscript{d} such as Prodigy \cite{Cramer2021,Read2023}, making it fall short of a truly general intermolecular K\textsubscript{d} calculator;
\item A variety of intermolecular K\textsubscript{d}-relevant factors need to be taken into account, such as temperature, biomolecular structural dynamics, pH \cite{Platzer2014,pH,enzyme}, site-specific protonation states (e.g., side chain \pka ~of protein) \cite{Xue2016,Li2017Gravity,AKL,hansen2014measurement}, PTMs, PEMs \cite{Li_2021Semaglutide,Weiss2013insu,Nuhoho_2019,Bucheit_2020}, ionic strength, buffer conditions \cite{Xue2016,S_ndergaard_2011,Olsson_2011};
\item A truly general intermolecular K\textsubscript{d} calculator requires a general forcefield for all types of atoms \cite{Hofmann2023};
\item A truly general intermolecular K\textsubscript{d} calculator requires a universal linear string/graph-based notation system for accurate and flexible description and representation of all molecular types and drug modalities \cite{Wang2023conformationalspace,Mller2021};
\item A truly general intermolecular K\textsubscript{d} calculator is able to be used the other way around in drug discovery \& design, i.e., to be used as a search engine for drug candidate(s). With the search engine, a list of therapeutic candidates can be retrieved and ranked according to drug-target K\textsubscript{d} values, with input parameters including a drug target and a desired drug-target K\textsubscript{d} (or a range of K\textsubscript{d} values).
\end{enumerate}


\section*{A biophysical and structural definition of GIBAC}
\subsection*{The biophysics underlying intermolecular binding affinity}
By definition, intermolecular binding affinity is the physical strength of the binding between a single molecule and its partner, and is typically expressed as the equilibrium dissociation constant (K\textsubscript{d}) \cite{Vangone2015,Kastritis2014}: 


\begin{equation}
\label{eqn2}
K_{\mathrm{d}} = e^{\frac{\Delta G}{RT}}
\end{equation}
where K\textsubscript{d} is the binding affinity (Molar, M), $\Delta G$ is the binding free energy (kcal mol$^{-1}$), $R$ is the ideal gas constant ($1.987 \times 10^{-3}$ kcal K$^{-1}$ mol$^{-1}$), $T$ is the temperature in Kelvin, $\ln$ denotes the natural logarithm, and $e$ denotes the base of $\ln$.
For K\textsubscript{d} in Equation~\ref{eqn2}, the smaller the value of K\textsubscript{d}, the greater the drug-target binding affinity; the larger the value of K\textsubscript{d}, the weaker the drug-target binding affinity \cite{ztrk2018,Wang_2021DeepDTA}.

\begin{equation}
\label{eqn3}
\Delta G = R T \ln K_{\mathrm{d}}
\end{equation}


For $\Delta G$ in Equation~\ref{eqn3} \cite{Reif2021}, the structural biophysics of a molecular system is governed by a delicate balance between attractive and repulsive forces, where $\Delta G$ is defined as the difference in the free energy between the bound state and the unbound state of the molecules \cite{ElkiwiKerdawykiwi2012}.
Specifically, in Equation~\ref{eqn3}, 

\begin{enumerate}
\item a negative value of \textDelta G (K\textsubscript{d} $<$ 1 M) indicates energetically favorable binding, i.e., an overall attractive force between the two molecules;
\item a positive value of \textDelta G (K\textsubscript{d} $>$ 1 M) indicates energetically unfavorable binding, i.e., an overall repulsive force between the two molecules;
\item a zero value of \textDelta G (K\textsubscript{d} = 1 M) indicates that no binding or interaction exists between the two molecules, i.e., no energy is required to separate the two molecules or to keep the two molecules bound.
\end{enumerate}


In practice, K\textsubscript{d} and \textDelta G are influenced by a variety of factors, including non-covalent intermolecular interactions such as electrostatic interactions (e.g., hydrogen bonds or salt bridges) \cite{Li2018booksma,me_rTPA_JCBC,Li_2020furin}, hydrophobic interactions and van der Waals (VdW, Figure~\ref{fig:88833}) forces \cite{Bitencourt_Ferreira_2021,Fahmy_2011,umeyama1977origin}, and environmental parameters (abbreviated as \underline{$envPara$} here) such as pH (e.g., protonation states of ionizable residues' side chains \cite{Li_2017NMR,Webb_2010NMR,Hansen_2014}), ionic strength and temperature (e.g., in molecular dynamics) \cite{S_ndergaard_2011,Olsson_2011}.
Furthermore, K\textsubscript{d} and \textDelta G may also be affected by the presence of additional molecules, where there are multiple interacting molecules in the system \cite{Votavova_2014}.


In light of the fact that drug-target K\textsubscript{d} is an essential parameter in drug discovery \& design \cite{Li_2021Semaglutide,Rifaioglu_2020,D_Souza_2020}, a series of computational tools have already been developed to calculate drug-target K\textsubscript{d}, including molecular mechanics-based calculations \cite{Malone_2022,Fuji2017} and machine-learning based predictions \cite{CarracedoReboredo2021,Soni_2020,Ballester_2010,BitencourtFerreira2019}.
In August 2022, the concept of GIBAC was coined and proposed for the first time in an MDPI preprint \cite{Li2022GIBAC}.
Here, this article puts forward an update of it, i.e., a \underline{truly general intermolecular K\textsubscript{d} calculator}:


\begin{equation} 
\label{eqn4}
K\textsubscript{d} = f(molecules, envPara)
\end{equation}


where \underline{$molecules$} represents the molecular system with the number of interacting partners being \underline{$X$} (two, three, or more), and the molecular system is to be described in strings, e.g., amino acid sequences, strings of letters for proteins, or strings of SMILES (Simplified Molecular Input Line Entry System) to represent the chemical structure of small molecules \cite{VasselliIL2,Torreskiwi1995,Lee1994,RahuelClermont1997}, or graphs to describe glycosylated proteins for instance, and \underline{$envPara$} represents environmental parameters \cite{Xue2016,Li2017Gravity,AKL,hansen2014measurement}.

For instance, in case \underline{$X = 3$} (i.e., in Equation~\ref{eqn1}), one example of GIBAC is a molecular system of \underline{PD-1 antibody}, PD-1 and PD-L1, where \underline{$molA$}, \underline{$molB$} and \underline{$molC$} represent PD-1, PD-L1 and PD-1 antibody (Keytruda for instance \cite{Robert2015keytruda}), respectively.
For a GIBAC as described in Equation~\ref{eqn1}, a \underline{traversal} intermolecular K\textsubscript{d} means the calculations of a list of intermolecular K\textsubscript{d} values: 

\begin{enumerate}
\item K\textsubscript{d1} (\textDelta G\textsubscript{1}) between \underline{$molA$} and \underline{$molB$}.
\item K\textsubscript{d2} (\textDelta G\textsubscript{2})  between \underline{$molA$} and \underline{$molC$}.
\item K\textsubscript{d3} (\textDelta G\textsubscript{3})  between \underline{$molC$} and \underline{$molB$}.
\item K\textsubscript{d4} (\textDelta G\textsubscript{4})  between \underline{$molA+B$} and \underline{$molC$}.
\item K\textsubscript{d5} (\textDelta G\textsubscript{5})  between \underline{$molA+C$} and \underline{$molB$}.
\item K\textsubscript{d6} (\textDelta G\textsubscript{6})  between \underline{$molB+C$} and \underline{$molA$}.


\end{enumerate}


With the discovery \& design of PD-1 antibody \cite{Li2018PD1,Pauken2015} as an example, a set of biophysical principles are defined as below:

\begin{enumerate}
\item K\textsubscript{d2} $<$ min([K\textsubscript{d1},K\textsubscript{d3},K\textsubscript{d4},K\textsubscript{d5},K\textsubscript{d6}]) (in \underline{Python} syntax), to ensure that the complex structure of \underline{$molA$} and \underline{$molC$} is the most energetically favourable among the six possible situations listed above;

\item the smaller the value of K\textsubscript{d2}, the better from a biophysical (yet not necessarily therapeutic \cite{Yu2023intermediate,Wlfing2023intermediate}) point of view;


\item the smaller the value of K\textsubscript{d2}, the better for the \textit{in vitro} use of the antibody in diagnosis or bioprocessing, e.g., purification \cite{Perret2019};


\item \textDelta G\textsubscript{3} = 0 or $>$ 0, to ensure that there is no binding or interaction between \underline{$molB$} and \underline{$molC$}, or, there exists a repulsive force between \underline{$molB$} and \underline{$molC$};

\item K\textsubscript{d2} $<$ K\textsubscript{d5}, and K\textsubscript{d2} $<$ K\textsubscript{d4}, to ensure that the complex structure of \underline{$molA$} and \underline{$molC$} is much more stable than that of \underline{$molA$} and \underline{$molB$}, such that the antibody is able to disrupt the PD-1-PD-L1 axis;

\item \textDelta G\textsubscript{4} = 0 or $>$ 0, \textDelta G\textsubscript{5} = 0 or $>$ 0, and \textDelta G\textsubscript{6} = 0 or $>$ 0, to ensure the existence of \underline{$molC$} (antibody) suppresses the PD-1-PD-L1 axis.



\end{enumerate}



In general, drugs exert their pharmacological effects through binding to and interacting with their target(s) \cite{Ding_2021elicit,Agrawal2018-sh}, be they small molecule inhibitors or biologics, making K\textsubscript{d} and \textDelta G two cornerstones for drug discovery \& design. 
As a result, a \underline{truly general intermolecular K\textsubscript{d} calculator} (GIBAC \cite{Li2022GIBAC}) is necessary and useful to ensure adequate, accurate, precise and cost-effective knowledge of the K\textsubscript{d} and \textDelta G, which is pivotal in both early-stage drug discovery \& design (drug-target K\textsubscript{d}), and in drug repurposing (drug-target K\textsubscript{d}), and also in avoiding undue risk of toxicity mediated by drug-drug interactions (DDI, drug-drug K\textsubscript{d}) \cite{Zeng_2021DTBA,Pushpakom_2018,Han_2022DDInteractions}. 

\subsection*{Structural information is indispensable to GIBAC's accuracy}
In the real world, molecules bind to and interact with each other in their three-dimensional configurations (structures) rather than one-dimensional forms (e.g., protein sequences), making accurate calculation of K\textsubscript{d} inextricably linked to accurate and abundant structural information, which provides essential data about the three-dimensional arrangement of the molecular system, including residue-specific interactions, structural (e.g., geometric \cite{Li2021PDB}) features at the binding interface, shape complementarity \cite{Agrawal2019,SchneidmanDuhovny2005}, electrostatic interactions (hydrogen bonding and salt bridging), hydrophobic interactions, VdW forces (Figure~\ref{fig:88833}), et cetera.

As a matter of fact, the extraction of experimental structural and biophysical features \cite{DiStefano2023ADMET,Li_2021Semaglutide,me_rTPA_JCBC} has long been a critical step in both physics- and AI-based approaches towards intermolecular K\textsubscript{d} calculations.
Take the physics-based Prodigy \cite{Vangone2015,Xue2016} for instance: protein-protein or protein-ligand K\textsubscript{d} is calculated using binary interfacial features, i.e., interfacial contacts between the two interacting partners (Equation~\ref{eqn2}), including interstructural hydrogen bonding features, electrostatic and hydrophobic interactions, and VdW forces (Figure~\ref{fig:88833}) between two binding molecules \cite{Bitencourt_Ferreira_2021,Fahmy_2011,Li_2020furin,Li2020abdesign}.
Prodigy \cite{Vangone2015,Xue2016} as a physics-based intermolecular K\textsubscript{d} calculator, therefore, can be described as: 

\begin{equation} 
\label{eqn5}
K\textsubscript{d} = f(ABcomplex, envPara)
\end{equation}

where \underline{$ABcomplex$} represents the complex structure of the two interacting partners.
It is obvious that Equation~\ref{eqn5} requires accurate structural information, which is (to date) not always available for any (bio)molecular system.
As a result, intermolecular K\textsubscript{d} calculators such as the one described by Equation~\ref{eqn5} need to be generalized further, where \underline{structural} information (in Equation~\ref{eqn5}) is replaced with \underline{sequences}, \underline{strings}, or \underline{graphs}:



\begin{equation} 
\label{eqn8}
K\textsubscript{d} = f(molAsequence, molBsequence, envPara)
\end{equation}

or,

\begin{equation} 
\label{eqn9}
K\textsubscript{d} = f(molAstring, molBstring, envPara)
\end{equation}


or,

\begin{equation} 
\label{eqn99}
K\textsubscript{d} = f(molAgraph, molBgraph, envPara)
\end{equation}


To date, a series of computational methods are available that use various AI algorithms to calculate intermolecular K\textsubscript{d} solely from protein sequence information, known as \underline{sequence-based} or \underline{sequence-only} approaches \cite{Yugandhar2014,Rube2022}.
As described in Equations~\ref{eqn8}, \ref{eqn9} and \ref{eqn99}, \underline{$molAsequence$ ($molAstring$)} or \underline{$molBsequence$ ($molBstring$)} represent sequences of amino acids, i.e., strings of letters for protein $A$ or $B$, or strings of SMILES characters \cite{OBoyle2012,Hhnke2018,Wiswesser1968} that represent the chemical structure of small molecules $A$ or $B$, and \underline{$molAgraph$} represents biomolecules with PTMs, glycosylated proteins for instance.

Towards the construction of a \underline{truly general intermolecular K\textsubscript{d} calculator}, here, this article for the first time calls for the establishment of a universal string-based linear or graph-based notation system for integrated lossless descriptions and representations of molecular fingerprints for all molecule types and drug modalities, with or without PTMs and/or PEMs:

\begin{enumerate}
\item PTM: glycosylation \cite{Herget2008}, phosphorylation \cite{Kawade2019,Nishi2014,Cohen2001}, fatty acid modifications \cite{Sud2012lipid,Foster2013lipid,Capecchi2020,Krenn2022}.
\item PEM: lipidation and fatty acid chain attachment(s) \cite{Resh2016}, including semaglutide (of Novo Nordisk) \cite{Li_2021Semaglutide}, insulin icodec (of Novo Nordisk) \cite{Nishimura2021}.
\end{enumerate}

With such a notation system, Equation~\ref{eqn4} can be rewritten as two sets of input parameters and two outputs, as listed in Table~\ref{tab:kd1}.

\begin{table}[htbp]
  \begin{center}
    \begin{tabular}{|l|c|c|} 
	\hline
      \textbf{Input 1} & \textbf{Input 2} & \textbf{Output}\\\hline
      $molAstring$, $molBstring$, ... & $envPara$ & \textDelta G, K\textsubscript{d} \\\hline
	  $molAgraph$, $molBgraph$, ... & $envPara$ & \textDelta G, K\textsubscript{d} \\\hline
    \end{tabular}
    \caption{A tabular description of Equations~\ref{eqn4}, \ref{eqn8}, \ref{eqn9} and \ref{eqn99}.}
\label{tab:kd1}
  \end{center}
\end{table}


In addition to such a linear/graph-based notation system, this article also calls for the development of a general forcefield for all available types of atoms (including those of unnatural amino acids) \cite{Hofmann2023}, and continued validation of it, to ensure accuracy and precision in the calculation of intermolecular K\textsubscript{d} for all molecule types and drug modalities \underline{with} or \underline{without} \underline{PTM} or \underline{PEM}.





\section*{Construction of an accurate and precise GIBAC}
\subsection*{A key ingredient to build a GIBAC: artificial intelligence}
By definition, calculation of intermolecular K\textsubscript{d} is a problem of biophysics and structural biology.
In light of the advantages and disadvantages of the currently available approaches (physics, AI and statistics) as discussed above, this article here puts forward a hybrid \cite{Steven2008} approach of AI and physics to ensure adequate accuracy, precision and interpretability of the intermolecular K\textsubscript{d} calculator, i.e., GIBAC \cite{Li2022GIBAC}.


\begin{figure*}[htbp]
\centering
\includegraphics[width=\textwidth]{space.png}
\caption{Relevant factors of the size of the entire molecular space \cite{Lipinski2004nature}.}
\label{fig:2222}
\end{figure*}

What's more, the space of molecular types and drug modalities is vast \cite{Kang2018}, extending beyond proteins and small molecules. 
This makes a comprehensive physics-based exploration practically impossible (Figure~\ref{fig:2222}) \cite{Chuang2022,COLEY2021133,Lipinski2004nature}. 
Another reason for the hybrid approach is the availability of various AI algorithms, including Graph Convolutional Networks (GCN), Graph Neural Networks (GNN), Graph Transformer Networks (GTN), Convolutional Neural Networks (CNN), Generative Adversarial Networks (GAN), et cetera.
Overall, the ultimate task here is for AI algorithms to accurately and precisely understand the biophysics underlying intermolecular binding \& interaction, from a structural and pharmaceutical point of view \cite{Rogers2013binding,Sugase2007binding,Li2018verapamil}.


\subsection*{Can AI be a digital crystal ball of drug discovery \& design?}
Artificial intelligence (AI) encompasses machine learning (ML) and deep learning (DL), which aim to develop intelligent machines capable of performing tasks that typically require human intelligence, such as understanding natural language, recognizing objects, solving complex problems, and making decisions \cite{Wong2023science,Paul_2021,Bender2021}.
Machine learning focuses on creating algorithms and models that can learn from available data to make predictions or decisions, while deep learning utilizes artificial neural networks with multiple layers to process and learn from data.
For AI (including ML and DL), feature extraction is a pivotal process to identify critical features from raw data, capturing important information for the learning process and improving the efficiency and accuracy of AI models \cite{Gorai2022insudesign,Wu2010aromaticisland,Gupta_2021}.


As of \today, AI is the most hyped technology in 2023 with the advent of generative AI tools such as OpenAI's ChatGPT.
In the field of drug discovery \& design, however, the current level of enthusiasm surrounding generative AI has elicited diverse reactions in the industry.
\begin{enumerate}
\item Dr.~Alex Zhavoronkov (co-CEO of Insilico Medicine) thinks that AI can significantly boost the probability of success in drug development, while also agreeing that the AI hype has driven valuations for newly-founded companies to levels that are likely not sustainable.
\item 'Nobody in the field is actually using AI,' said \schro ~CEO Ramy Farid in an interview. 'Describing \schro ~as a machine-learning company would be like describing \schro ~as a company that uses Microsoft Office' \cite{AndrewDunn}, while Mr. Geoffrey Porges (CFO of \schro) considers \schro ~not as an AI company, but as a proprietary software and drug company \cite{AndrewDunn}.
\item `AI-powered' is tech's meaningless equivalent of `all natural', according to Devin Coldewey of TechCrunch, where `AI-powered' is used to create a perception of advanced technology without providing specific details about how AI is being utilized or what benefits it actually brings to drug discovery \& design.
\end{enumerate}

Overall, while the industry itself is concerned about the current AI hype 'inevitably coming back down to earth', there is still optimism about what’s next for the field \cite{AndrewDunn}, as ultimately AI will prove useful in accelerating this entire process with its ability to analyze and learn from vast amounts of data.

Taken together, drug discovery \& design itself is a complex, time-consuming, and expensive process, calling for the drug R\&D community to get over the current AI hype and get back to the basics of a hybrid approach of AI and biophysics \cite{Steven2008}.
With intermolecular K\textsubscript{d} calculator as an example, by leveraging ML and DL techniques, AI can analyze and learn from molecular sequence and structure data, including residue-specific interactions, interstructural features at the binding interface \cite{Li2021PDB,Li_2020furin,Li_2021Semaglutide}, shape complementarity \cite{Agrawal2019,SchneidmanDuhovny2005}, electrostatic interactions (hydrogen bonding and salt bridging), hydrophobic interactions, VdW forces (Figure~\ref{fig:88833}) \cite{Smith_2021}.

Consequently, this article here
\begin{enumerate}
\item argues that AI can indeed act as a digital crystal ball of drug discovery \& design, provided that it is used in combination with experimental insights from structural biology, biophysics, pharmacology \cite{Li2018verapamil}, et cetera.
\item puts forward a hybrid approach of AI and biophysics to ensure adequate accuracy, precision and interpretability of a truly general intermolecular K\textsubscript{d} calculator, i.e., GIBAC \cite{Li2022GIBAC}, as described in Equation~\ref{eqn4} and Table~\ref{tab:kd1}.
\end{enumerate}


\subsection*{Construction of GIBAC: experimental data and tools}
AI algorithms rely on huge amounts of data to learn, train, and improve their performance continuously, where the quantity and quality of the data are inextricably linked to the performance of the AI model \cite{Berman2003Announcing,Li2020EUT,Evans_2021}.
As charted out previously in \cite{Li2022GIBAC}, therefore, the construction of GIBAC requires two key ingredients, i.e., \underline{data} and \underline{algorithm}, and is to follow a roadmap as defined by Equation~\ref{eqn10} \cite{Li2022GIBAC}.


\begin{equation} 
\label{eqn10}
data + algorithm = model
\end{equation}

To ensure a GIBAC with adequate accuracy and precision, a substantial amount of experimental data with reasonable accuracy is crucial, including Protein Data Bank (PDB) \cite{Berman2003Announcing,PDB2018}, PDBbind and BindingDB \cite{PDBbind,Liu_2007BindingDB,Greenidge_2012}, CASF2016 and CASF2013 databases \cite{Su20182016,Li20182013}, DUD-E \cite{Mysinger2012DUDE}, ChEMBL \cite{Liu_2015PDBbindbind}, DrugBank \cite{Law2013DrugBank}, CSAR \cite{Dunbar2013}, MUV \cite{Rohrer2009}, PDBbind-CN \cite{Wang2004PDBbind}, Antigen-Antibody Interaction Database (AgAbDb) \cite{KulkarniKale2014}, NIH molecular libraries initiative database \cite{Austin_2004}, the international ImMunoGeneTics information system (IMGT) \cite{Manso2021}, et cetera.

Moreover, a wide range of wet-lab tools are also available to ensure continued accumulation of experimental data in structural biology, biophysics, medicinal and computational chemistry, drug discovery \& design, including isothermal titration calorimetry (ITC) \cite{freire2009biophysical,Johnson_2021,Vel_zquez_Coy_2004}, surface plasmon resonance (SPR) \cite{sauer2008surface}, nuclear magnetic resonance (NMR) spectroscopy \cite{ernst1990principles}, cryogenic electron microscopy (cryo-EM) \cite{deOliveira2021}, fluorescence resonance energy transfer (FRET) \cite{haas2001fluorescence}, microscale thermophoresis \cite{seidel2013microscale}, differential scanning fluorimetry \cite{pantoliano2001high}, X-ray crystallography \cite{drenth2007principles}, mass spectrometry \cite{smith2002proteomics}, bio-layer interferometry \cite{rich2009biolayer}, et cetera.

Nonetheless, experimental data and tools alone are useful but insufficient in the construction of a GIBAC with adequate accuracy and precision (Equation~\ref{eqn4}) \cite{Li2022GIBAC}.



\subsection*{Construction of GIBAC: computational data and tools}
As mentioned earlier, exploring the entire molecular space with physical calculation alone is practically impossible due to its size (Figure~\ref{fig:2222}) \cite{Chuang2022,COLEY2021133,Lipinski2004nature}.
In the construction of GIBAC, a hybrid approach combining AI and physics is therefore described here. 
However, AI algorithms can still struggle to accurately and precisely calculate intermolecular K\textsubscript{d} data that are different from the training data set, which only covers a limited portion of the molecular space (Figure~\ref{fig:2222}) \cite{Kang2018}.
This is where synthetic data and its generators (i.e., computational tools) come in \cite{Chen2021Synthetic,Jadon2023}, including:

\begin{enumerate}
\item computational structural data from AlphaFold database \cite{Callaway_2022};
\item synthetic (both apo and complex) structural data generators \cite{Waterhouse2018,Jalily_Hasani_2017,Pettersen2004,Tong2021};
\item molecular docking tools \cite{Agu2023dock};
\item synthetic K\textsubscript{d} data generators \cite{Vangone2015,Xue2016,zheng2017calculating,deng2012predicting,du2017molecular,li2014computational};
\item molecular dynamics simulation tools \cite{karplus2002molecular,AKL};
\item side chain placement and energy minimization algorithms \cite{Canzar2011} to incorporate structural arrangement information of PTMs and PEMs into currently available structural models.
\end{enumerate}




\subsection*{High-throughput generation of synthetic data for GIBAC}
To illustrate how synthetic structural and biophysical data is generated with reasonable accuracy by currently available computational tools, Modeller \cite{Webb2020} and the PD-1/PD-L1 complex structure \cite{Lin2008papier,Lin2008Complex} are used as an example to generate synthetic apo and complex structural data. 


\begin{figure*}[htb!]
\centering
\includegraphics[width=0.6\textwidth]{ecloud.png}
\caption{An illustration of the generation of synthetic structural data through the electron cloud model of a hydrogen atom. In this figure, the black solid circle represents the hydrogen nucleus, the scattered black dots represent the possible locations of hydrogen's electron, while $R$ represents the distance between the electron (synthetic data) and the proton (i.e., nucleus, experimental data) of hydrogen.}
\label{fig:ecloud}
\end{figure*}




Specifically, the PD-L1 length is 222 amino acid residues, and the PD-1 length is 134, and their combined length is 356. 
The experimental PD-1/PD-L1 complex structure (PDB access code: 3BIK) is used as a starting point to generate synthetic data by introducing a limited number ($k$) of site-directed mutations to ensure reasonable accuracy of the synthetic data.
With PD-1/PD-L1 complex structure as a template, the size ($s$ in Equation~\ref{eq:classic1}) of the synthetic structural data is described as below,

\begin{equation}
s = g(n,k) = \frac{n!}{k!(n-k)!} \times 20^k
\label{eq:classic1}
\end{equation}



where $n$ (Equation~\ref{eq:classic1}) represents the length of PD-1, PD-L1 or the complex of the two, and $k/n$ $<$ 5\% (Equation~\ref{eq:classic1}) ensures the overall reasonable accuracy of the synthetic structural data.
Thus, 

\begin{enumerate}
\item for PD-L1 (an apo structure), $s=g(222,11)= 1.2571 \times 10^{18} \times 20^{11} \approx 2.57 \times 10^{32}$;
\item for PD-1 (an apo structure), $s=g(134,6)=7.1779 \times 10^9 \times 20^{6} \approx 4.59 \times 10^{17}$;
\item for PD-L1/PD-1 (a complex structure), $s=g(356,17)=4.5190 \times 10^{28} \times 20^{17} \approx 5.92 \times 10^{50}$.
\end{enumerate}




\begin{figure*}[htb!]
\centering
\includegraphics[width=0.7\textwidth]{rainbow268mergarainbow37merga.png}
\caption{A colored depiction of a variety of synthetic data sets, including experimental data (atomic nuclei of hydrogens, Figure~\ref{fig:ecloud}), synthetic data (atomic electron clouds of hydrogens, Figure~\ref{fig:ecloud}) and uncharted territories, i.e., the ocean, the white region.}
\label{fig:6666666666}
\end{figure*}


In short, therefore, for one experimental structure ($n = 134$), a total of approximately $4.59 \times 10^{17}$ synthetic apo structures (i.e., $7.17 \times 10^{9}$ combinations of mutation sites, each with $20^{6}$ possible residue assignments, Equation~\ref{eq:classic1}) can be generated based on the experimental structure of PD-1 with reasonable accuracy, i.e., those synthetic apo structures are at least 95\% homologous to their experimental template, i.e., the experimental structure of PD-1.

Here, the generation of synthetic structural data is similar to the distribution of the electron cloud model of a hydrogen atom, as shown in Figure~\ref{fig:ecloud}.
A hydrogen atom is composed of a single negatively charged electron moving around the positively charged proton, which is the nucleus of the hydrogen atom, where $R$ (Figure~\ref{fig:ecloud}) represents the distance between the electron (synthetic data) and the proton (i.e., nucleus, experimental data) of hydrogen. The larger the value of $R$ (Figure~\ref{fig:ecloud}), the lower the density of the electron cloud of the hydrogen atom; by analogy, the higher the value of $k/n$ (Equation~\ref{eq:classic1}), the lower the homology between the synthetic data and its experimental template, and the lower the accuracy of the synthetic structural data.


Taken together, as shown in Figure~\ref{fig:6666666666}, the white region represents the experimentally and computationally uncharted territories (i.e., the ocean, Figure~\ref{fig:6666666666}), while the rest of Figure~\ref{fig:6666666666} consists of islands of synthetic data, with the atomic nuclei representing experimental data, and the electron clouds representing the synthetic data.
Thus, Figure~\ref{fig:6666666666} is indeed a collection of \underline{mini GIBACs} (Figure~\ref{fig:6666666666}) for target-specific calculations and ranking of K\textsubscript{d} and \textDelta G.
Specifically, the hydrogen electron cloud models in Figure~\ref{fig:6666666666} are able to represent apo and complex structural data (both experimental and synthetic), and also K\textsubscript{d} and \textDelta G data, both experimental and synthetic, where the white regions of Figure~\ref{fig:6666666666} correspond to uncharted territories of the K\textsubscript{d} and \textDelta G data.
To train an AI- and physics-based intermolecular K\textsubscript{d} calculator, therefore, K\textsubscript{d} is predefined to be 1 M for white regions of Figure~\ref{fig:6666666666}, until new experimental K\textsubscript{d} data \cite{freire2009biophysical,Johnson_2021,Vel_zquez_Coy_2004} is updated in databases as mentioned above \cite{PDBbind,Liu_2007BindingDB,Greenidge_2012}.

Hence, in spite of the size of the entire molecular space (Figure~\ref{fig:2222}) \cite{Kang2018}, the size of the synthetic data space is also quite considerable, and with a variety of synthetic data generators such as Modeller \cite{Webb2020} and Prodigy \cite{Vangone2015,Xue2016,zheng2017calculating}, it still is conceivable for currently available AI algorithms (or new ones in future) to learn, train, and improve their performance continuously, in the sense that AI algorithms keep learning the biophysics underlying molecular folding \cite{Li2022Unifold,Goldbach2019}, binding \& interaction \cite{Rogers2013binding,Sugase2007binding}, and structural action mechanism \cite{Li2018verapamil}.
Overall, the situation here is a bit like the generation of a human molecular structural binding and interacting atlas, akin to Human BioMolecular Atlas Program (HuBMAP), a global initiative that aims to assemble spatial maps of biomolecules, including RNA, proteins, and metabolites at single-cell resolution \cite{Jain2023HuBMAP,Jain2023HuBMAP2019}.


\section*{Application of GIBAC in drug discovery \& design}
In recent years, the application of AI (including DL and ML) is becoming increasingly popular in drug discovery \& design \cite{DiStefano2023ADMET,Cheng2023missense,CarracedoReboredo2021,Sadybekov2021sython}, particularly in lead optimization and ADMET studies, including carcinogenicity, hepatotoxicity, et cetera \cite{DiStefano2023ADMET}.
For instance, \underline{VenomPred} is a promising solution for deriving structural toxicophores and assessing the safety profile of compounds.


\begin{figure*}[htb!]
\centering
\includegraphics[width=0.9\textwidth]{gibacuse1.png}
\caption{A variety of the practical application of GIBAC, including  
SME (small molecule inhibitor), Ab (antibody), Ag (antigen), XDC (antibody-drug conjugate (ADC), peptide-drug conjugate (PDC), aptamer-drug conjugate (ApDC) \cite{Kinghorn2017}), rPeptide (recombinant peptide drug), rProtein (recombinant protein drug), intrabodies \cite{Lobato2004}, proteolysis-targeting chimeric molecules (PROTAC) \cite{Weng2020,Weng2022}, drug-drug interaction (DDI) \cite{Zhang2017DDI}, chimeric antigen receptor T (CAR-T) cell therapy \cite{Rahnama2022}.}
\label{fig:8}
\end{figure*}



Here, given the definition of GIBAC as in Equation~\ref{eqn4} and Table~\ref{tab:kd1}, the discussion of the practical application of GIBAC in drug discovery \& design focuses on intermolecular binding and interactions.
In biological systems, there is a wide range of intermolecular binding pairs, including enzyme-substrate \cite{Fakhrai_Rad_2000,GonzlezCasimiro2021}, ligand-receptor \cite{ferreira2019computational,Petukh2013}, protein-protein \cite{Vangone2015,Jubb2017}, ion channel-drug \cite{Li2020gaga,Elliott2011}, antibody-antigen \cite{Wu_2021,Mason_2021,Li2020abdesign}, DNA-protein \cite{Lane1992}, RNA-protein \cite{Sola2011,Kinghorn2017}, RNA-RNA \cite{Sola2011}, hormone-receptor \cite{Iida_2004}, coenzyme-substrate \cite{Zhang2018coenzyme}, metal ion-protein \cite{Ham2017,Li2018verapamil}, lipid-protein \cite{Rhee2001}, et cetera.
By definition, GIBAC can find its use for any binding pair involved in the molecular pathogenesis of human diseases, infectious or non-communicable, including: 
\begin{enumerate}
\item ligand-receptor binding, e.g., insulin binding to its receptor \cite{RahuelClermont1997};

\item protein-protein interaction, e.g., TNF-\textalpha ~binding to its receptor;

\item ion channel-drug interaction, e.g., verapamil binding to Ca\textsubscript{V}1.2 \cite{Li2018verapamil};


\item antibody-antigen binding, e.g., Keytruda binding to PD-1 \cite{Robert2015keytruda};


\item self-association and aggregation, e.g., formation of amyloid-\textbeta ~oligomer \cite{Mroczko2017,Pinheiro_2021}.

\end{enumerate}




\begin{figure*}[htb!]
\centering
\includegraphics[width=0.9\textwidth]{insu.png}
\caption{A molecular binding and interaction network of insulin. In this figure, IR-A, IR-B, IGF-1R and IDE represent two isoforms of insulin receptor \cite{Hubbard2013IR}, insulin-like growth factor 1 receptor \cite{Cao2021IGFReceptor,Vigneri2010} and insulin degrading enzyme (IDE) \cite{Fakhrai_Rad_2000}, respectively. }
\label{fig:3insu}
\end{figure*}


Overall, GIBAC is useful for drug discovery \cite{Trosset_2019}, drug design \cite{Mervin_2021}, lead optimization \cite{Cavasotto_2020}, drug repurposing \cite{Sam2017}, and DDI prediction \cite{Zhang2017DDI}.
Take insulin for instance \cite{RahuelClermont1997}, for which a series of analogues have been designed, synthesized and therapeutically tested towards a better glycaemic control for diabetic patients \cite{Hua2008insudesign,Weiss2013Design}.
After injection, insulin exists as multi-forms, including 
multi-hexamer, 
di-hexamer,
hexamer,
dimer or 
monomer \cite{Cheng2021100years}, as shown in Figure~\ref{fig:3insu}.
While insulin mainly binds to its own specific receptor, the insulin receptor (IR, Figure~\ref{fig:3insu}), it also binds to IGF-1R \cite{Varewijck2012,Zhang2020IGF-1Rpaper,Zhang2020IGF-1R} for an increased mitogenic potential (e.g., continued growth of pre-existing neoplasms \cite{Gallagher2011,Sandow2009mitogenic}), and also binds to IDE \cite{Fakhrai_Rad_2000} for enzyme-mediated degradation of insulin within cells.
Moreover, accumulating evidence has led to a concept of the physiological roles of IR isoforms, where predominant IR-A expression may be important for prenatal growth and development, while IR-B expression has a more important role in metabolic insulin action in adults \cite{Belfiore2017}. 

Thus, with GIBAC as a K\textsubscript{d}-based search engine for therapeutic candidate(s), the problem of drug discovery \& design here is to be a matter of plugging into the search engine the name(s) of insulin's binding partner(s) and their associated K\textsubscript{d} and \textDelta G value(s) or value range(s) specified by the user(s) of GIBAC.
Afterwards, the search engine traverses all molecular types and drug modalities, and returns a list of K\textsubscript{d}-ranked molecular candidates, including but not limited to insulin analogues available to date, for continued improvement of glycaemic control of diabetic patients.

Take ADC for another example, for which internalization is crucial for delivering the cytotoxic drug into the cancer cells \cite{Kang2018,Wan2019internalize}. 
To achieve effective internalization and drug release, a high binding affinity between the antigen and antibody is desired. This affinity ensures strong and stable interaction between the ADC and its cancer cell receptor, increasing the likelihood of efficient internalization. 
Moreover, a high binding affinity enhances ADC retention within the cancer cells, maximizing its cellular exposure to the cytotoxic drug and boosting the therapeutic efficacy of ADC. 
Thus, GIBAC here is able to act as a search engine for antibodies with high (as high as biophysically possible) affinity and specificity to target antigens, facilitating efficient ADC internalization, minimizing off-target effects, and enhancing therapeutic efficacy \cite{Jin2022ADC}.



\section*{Technical challenges of GIBAC: openness is the key}
As mentioned above, an exhaustive exploration of the entire molecular space is practically impossible (Figure~\ref{fig:2222}) \cite{Chuang2022,COLEY2021133}, despite the abundance of structural and biophysical data and the tools for synthetic data generation. 

\begin{figure*}[htbp]
\centering
\includegraphics[width=0.8\textwidth]{designmaketest11.png}
\caption{An iterative process to build and refine an AI- and physics-based intermolecular K\textsubscript{d} calculator (i.e., GIBAC) with adequate accuracy and precision.}
\label{fig:4hshhgfd}
\end{figure*}



Nonetheless, AI algorithms alone are insufficient to build a GIBAC with adequate accuracy and precision, such that it is able to find its use in drug discovery \& design. 
Therefore, this article here proposes an \underline{open} strategy (Figure~\ref{fig:4hshhgfd}), because


\begin{enumerate}
\item the AI-training and retraining processes require a \textbf{huge} amount of data with reasonable accuracy, variety and cost.

\item openness (Figure~\ref{fig:4hshhgfd}) in data, algorithms, source code, and AI models is essential for promoting transparency, reproducibility, and collaboration within the whole community of drug discovery \& design, and facilitates the continued improvement of the performance of GIBAC.


\item the accuracy and the precision of GIBAC are inextricably linked to a variety of interlinked factors, which are to be discussed in the next section.
\end{enumerate}

\section*{Five interlinked factors linked to the accuracy and the precision of GIBAC}
As two cornerstones for both experimental measurement and computational calculation, \textbf{accuracy} and \textbf{precision} help ensure the reliability and validity of the results obtained.
As an AI- and physics-based intermolecular K\textsubscript{d} calculator, GIBAC is no exception here.
Take the K\textsubscript{d} of antigen-antibody binding for example, where a study of an anti-4-1BB monoclonal antibody found that the optimal antibody binding affinity depends on its intended application.
For blocking antibodies, a high affinity is most effective. However, for agonist antibodies, the results show that an intermediate affinity works best \cite{Yu2023intermediate,Wlfing2023intermediate}.




To further highlight the importance of accuracy and precision, this article looks back at the history of experimental measurements of three physical constants: the Planck constant, the Boltzmann constant and the gyromagnetic ratio.
Take the Planck constant for the first example, which provides the foundation of quantum physics, and was experimentally measured to be 6.626 069 57 $\times$ 10\textsuperscript{-34} Joule seconds \cite{Steiner2012Planck}.
As the second example, the Boltzmann constant was experimentally measured to be 1.380 649 $\times$ 10\textsuperscript{-23} Joule Kelvin\textsuperscript{-1} \cite{Pitre2019}.
Lastly, the gyromagnetic ratio of the proton (hydrogen nucleus) in water (\textgamma) is measured to be 2.67513 $\times$ 10\textsuperscript{8} Weber\textsuperscript{-1} m\textsuperscript{2} sec\textsuperscript{-1} with a total probable error of 5 in 10\textsuperscript{6} \cite{VIGOUREUX1963,Kibble1979}.
Taken together, these three examples highlight the importance of accuracy and precision in both experimental measurement and computational prediction.
For drug discovery \& design in particular, intermolecular K\textsubscript{d} calculations need to be both accurate (close to its true value) and precise (reproducible \& consistent, with a total probable error similar to that of \textgamma ~ideally), such that informed decisions could be made during the drug discovery process, to ensure \textbf{accurate} and \textbf{precise} selection of the most promising drug candidates in the earliest stage of R\&D.


\begin{figure*}[htb!]
\centering
\includegraphics[width=0.8\textwidth]{gibachallenge1.png}
\caption{Key factors related to the accuracy and the precision of GIBAC.}
\label{fig:77733key}
\end{figure*}



Interestingly, the key factors related to GIBAC's accuracy and precision (Figure~\ref{fig:77733key}) are interlinked to one another.
The starting point is experimental structure determination, where experimental techniques such as X-ray crystallography, NMR spectroscopy, cryo-electron microscopy (cryo-EM), or cryo-electron tomography (cryo-ET) \cite{Li2023cryo-ET,Eisenstein2023} are used to record direct experimental data, e.g., X-ray diffraction patterns, chemical shifts, cryo-EM images, or cryo-ET tomograms.
Afterwards, structure calculation methods are used to refine and interpret the experimental data, which involve mathematical algorithms and computational structural modeling to provide the initial atomic coordinates, with which the forcefield (Figure~\ref{fig:77733key}) describes the interactions between atoms.
Subsequently, energy minimization optimizes the atomic positions to find the lowest energy configuration, and to generate an accurate and complete representation of the (bio)molecule's structure.




\begin{figure*}[htb!]
\centering
\includegraphics[width=0.8\textwidth]{PTM.png}
\caption{A collection of different post-translational modifications (PTMs).}
\label{fig:77368PTM}
\end{figure*}






Specifically, these factors (forcefield, structural information, PTM, PEM, \pKa, Figure~\ref{fig:77733key}) are interlinked, in the sense that 

\begin{enumerate}
\item both accurate \pKa~values and forcefield are required for the energy minimization step in structural calculation for experimental structure determination.
\item for the description of a molecular structure system, a range of factors are required, including forcefield, atomic coordinates, structural information, PTM, PEM, \pKa.
\item physics-based \pKa ~calculation requires accurate structural information of protein, PTM, PEM, protein with PTM, or protein with PEM \cite{Nishi2014,Cohen2001}.
\item accurate structural information and \pKa~ values are useful for continued improvement of the accuracy of forcefield.
\item not only do biomolecules (e.g., proteins) have site-specific \pKa ~values, but PTM- or PEM-related chemical groups/moieties (e.g., glycans and lipids, Figure~\ref{fig:77368PTM}) also have site-specific \pKa ~values. For instance, DMPG's and DMPC's head groups have intrinsic \pKa ~values of $\sim$ 3.5 and $\sim$ 1.0, respectively \cite{Marsh2013lipids,AKL}.
\end{enumerate}




\section*{Towards a truly general site-specific \pKa ~calculator}
To accurately and precisely calculate intermolecular K\textsubscript{d}, a series of factors need to be considered, including forcefield, atomic coordinates, structural information, PTM, PEM, \pKa~ (Figure~\ref{fig:77733key}). 
Below, site-specific \pKa ~is taken as an example to explain why and how it is linked to the accuracy and the precision of GIBAC, and how to tackle this \pKa ~issue.

As is well known, electrostatics (salt bridges, hydrogen bonds, charge-charge attraction or repulsion, et cetera) plays an important role in biomolecular structure and function \cite{pH,Kumar2010,enzyme}, in which a specific set of amino acid residues are of particular relevance due to the variable protonation states of their ionizable side chains \cite{Platzer2014,hansen2014measurement}. 
In an acid-base equilibrium, the dissociation constant is usually written as $K_a=\frac{[A][H]}{[HA]}$, where $[H]$, $[A]$ and $[HA]$ represent the concentrations of protons, the unprotonated and the protonated forms of a titratable group, respectively. For the titratable group, $K_a$ is a measure of its acidity: the higher the $K_a$, the higher the acidity.
$K_a$ can also be expressed as \pKa ~with the equation $pK_a=-\log_{10}{K_a}$, similar to the way pH is defined \cite{Li2017Gravity}.


To address the \pKa ~issue, this article here proposes a truly general site-specific protonation constant (\pKa) calculator (GSPCC, Equation~\ref{eqn123GSPPC}), similar to the way GIBAC is defined in Equation~\ref{eqn4}:


\begin{equation} 
\label{eqn123GSPPC}
\pKa ~= f(molecules, envPara)
\end{equation}


Of note, a protonation-deprotonation equilibrium is indeed a ligand-binding reaction with the ligand being a proton (H\textsuperscript{+}).
As shown in Equation~\ref{eqn123GSPPC}, therefore, GSPCC is essentially a specific form of GIBAC, where proton (a sub-atomic particle) and the ionizable chains constitute the two binding partners.
Furthermore, with pH and \pKa, protonation state/proton occupancy ($\theta$) can be defined as in Equation~\ref{eqn223theta}, to describe the degree to which the entire population of the ionizable side chains is protonated \cite{Li2017Gravity}. That is, when $pH = \pKa$, the ionizable side chain is half protonated and half deprotonated, i.e., $\theta = 50\%$, assuming that the site-specific protonation/deprotonation follows the classic Henderson-Hasselbalch equation.

\begin{equation} 
\theta=\frac{10^{(pK\raisebox{-.4ex}{\scriptsize a} - pH)}}{1 + 10^{(pK\raisebox{-.4ex}{\scriptsize a} - pH)}}=\frac{10^{pK\raisebox{-.4ex}{\scriptsize a}}}{10^{pK\raisebox{-.4ex}{\scriptsize a}} + 10^{pH}}
\label{eqn223theta}
\end{equation}


\begin{table}[htbp]
  \begin{center}
    \begin{tabular}{|l|c|c|} 
	\hline
      \textbf{Input 1} & \textbf{Input 2} & \textbf{Output}\\\hline
      $molAstring$, $molBstring$, ... & $envPara$ & \pKa \\\hline
      $molAgraph$, $molBgraph$, ... & $envPara$ & \pKa \\\hline	  
    \end{tabular}
    \caption{A tabular description of GSPCC (Equation~\ref{eqn123GSPPC}).}
    \label{tab:pkaGSPCC}
  \end{center}
\end{table}


Since GSPCC is essentially a specific form of GIBAC, the construction of the two follows the same roadmap as defined by Equation~\ref{eqn10} \cite{Li2022GIBAC}.
To date, \pKa ~can be measured by various experimental methods, including UV-spectroscopy \cite{box2003high}, a pH-metric approach \cite{avdeef1993ph}, a capillary electrophoresis-based approach \cite{ishihama2002rapid} and NMR spectroscopy \cite{webb2011remeasuring,pelton1993tautomeric,basel,Li2017Gravity}.
In addition to experimental measurements of \pKa, computational tools have also been developed, such as protein \pKa ~prediction with ML \cite{Cai2021,Xiong2021} and physics-based \pKa ~calculators such as PROPKA \cite{Olsson_2011}, which are able to act as synthetic \pKa ~data generators.


In short, given the importance of electrostatics in intermolecular binding affinity \cite{Li_2020furin,Li2017SMA}, it is necessary to first build a GSPCC with adequate accuracy and precision in order to build a GIBAC with adequate accuracy and precision, to ensure its applicability in drug discovery \& design.





\section*{Technical limitations of GIBAC: K\textsubscript{d}, K\textsubscript{on}, K\textsubscript{off}, ...}
While drug-target K\textsubscript{d} is an essential parameter for drug discovery \& design, it is but one of the many aspects of drug R\&D.
For instance, K\textsubscript{d} and \textDelta G have been used to indicate the efficacy of a drug.
However, this is not always the case. It has recently been shown that \underline{residence time} ($RT$) is a better indicator of efficacy than K\textsubscript{d} for some systems \cite{Costa2016,Copeland2015}.


In biophysics, the relationship between K\textsubscript{d} (dissociation constant), K\textsubscript{on} (association rate constant), and K\textsubscript{off} (dissociation rate constant) can be described as $K_{\mathrm{d}} = K_{\mathrm{off}}/K_{\mathrm{on}}$, while $RT$ refers to the average time a molecule spends bound to its target before dissociation.
Thus, $K_{\mathrm{on}}$ represents the rate at which a molecule associates with its target, while $K_{\mathrm{off}}$ represents the rate at which the molecule dissociates from the target.
\begin{table}[htbp]
  \begin{center}
    \begin{tabular}{|l|c|c|} 
	\hline
      \textbf{Input 1} & \textbf{Input 2} & \textbf{Output}\\\hline
      $molAstring$, $molBstring$, ... & $envPara$ & K\textsubscript{d}, K\textsubscript{on}, K\textsubscript{off}, RT \\\hline
      $molAgraph$, $molBgraph$, ... & $envPara$ & K\textsubscript{d}, K\textsubscript{on}, K\textsubscript{off}, RT \\\hline	  
    \end{tabular}
    \caption{A tabular description of a general intermolecular biophysics calculator.}
    \label{tab:konoffgibc}
  \end{center}
\end{table}


As a result, while GIBAC is defined as in Equation~\ref{eqn4}, additional output parameters are necessary for the construction of a general intermolecular biophysics calculator (GIBC) based on the hybrid $AI+physics$ approach, as outlined in Table~\ref{tab:konoffgibc}.
To this end, a set of biophysical parameters (in addition to the intermolecular K\textsubscript{d}) allows a further generalization of the GIBAC originally proposed in \cite{Li2022GIBAC} and defined in Equation~\ref{eqn4} and Table~\ref{tab:kd1}, leading to the concept of GIBC (Table~\ref{tab:konoffgibc}), which is to be one biophysics-based future direction of GIBAC.



\section*{Two future directions of GIBAC: AI + biophysics}
This article puts forward a physics+AI hybrid approach, which is necessary for the construction of a GIBAC with adequate accuracy and precision to be used in drug discovery \& design, and which aims at increasing the probability of successfully discovering new drugs, while reducing discovery costs and timelines.


\begin{figure*}[htb!]
\centering
\includegraphics[width=\textwidth]{futuredirectionofgibac.png}
\caption{Two future directions of GIBAC, i.e., a truly general intermolecular biophysics calculator (GIBC) and a \underline{chatbot} for drug discovery \& design.}
\label{fig:futuredirection}
\end{figure*}




\begin{table}[htbp]
  \begin{center}
    \begin{tabular}{|l|c|c|} 
	\hline
      \textbf{Input 1} & \textbf{Input 2 ($envPara$, et cetera)} & \textbf{Output}\\\hline
      $molAstring$, $molBstring$, ... & pH, \pKa, $T$, ionic strength, ... & \textDelta G, K\textsubscript{d}, K\textsubscript{on}, K\textsubscript{off}, RT \\\hline
      $molAgraph$, $molBgraph$, ... & pH, \pKa, $T$, ionic strength, ... & \textDelta G, K\textsubscript{d}, K\textsubscript{on}, K\textsubscript{off}, RT \\\hline	  
    \end{tabular}
    \caption{A tabular description of \underline{a truly general intermolecular biophysics calculator}.}
    \label{tab:alles}
	\label{tab:konoffnew}
  \end{center}
\end{table}



Therefore, in addition to an AI- and physics-based GIBC as defined in Tables~\ref{tab:konoffgibc} and \ref{tab:konoffnew}, this article further discusses the potential of the intermolecular K\textsubscript{d} calculator-based search engine (i.e., GIBAC) to act as a ChatGPT-like chatbot for drug discovery \& design, which is able to accurately, precisely and efficiently handle questions as below: 

\begin{enumerate}
\item for \underline{mini GIBAC} (Figure~\ref{fig:6666666666}), can you please generate a K\textsubscript{d}-ranked list of insulin analogues which bind to IR with a K\textsubscript{d} within a desired value range?
\item for \underline{mini GIBAC} (Figure~\ref{fig:6666666666}), can you please generate a list of insulin analogues which do not bind to IGF-1R or IDE?


\item for \underline{mini GIBAC} (Figure~\ref{fig:6666666666}), can you please generate a list of insulin analogues which do not form dimers or hexamers?


\item for \underline{mini GIBAC} (Figure~\ref{fig:6666666666}), can you please generate a K\textsubscript{d}-ranked list of insulin analogues which form highly stable dimers or hexamers?


\item for \underline{GIBAC} (Figure~\ref{fig:6666666666}), can you generate a K\textsubscript{d}-ranked list of therapeutic candidates which target X (i.e., drug target) of different species, e.g., X of human \cite{Zimmermann1981}, X of cat/dog \cite{Enomoto2019}, X of horse \cite{Kalnins2020}, et cetera?


\item for \underline{GIBAC} (Figure~\ref{fig:6666666666}), can you generate a K\textsubscript{d}-ranked list of therapeutic candidates which target X, Y and Z, such as vorolanib \cite{Sheng2023Vorolanib} or retatrutide \cite{Urva2023}?



\item for \underline{GIBAC} (Figure~\ref{fig:6666666666}), can you please generate a K\textsubscript{d}-ranked list of therapeutic small molecule candidates \cite{Arkin2014,Erlanson_2000} which target $X$ and possess a K\textsubscript{d} within a specified K\textsubscript{d} range, and which do not target $Y$ or $Z$?

\item for \underline{GIBAC} (Figure~\ref{fig:6666666666}), can you please generate a list of prefusion-stabilizing molecules or introduce a set of mutations into the S protein to stabilize its conformation in the prefusion state? Locking the conformation of the S protein of SARS-CoV-2 into the premembrane-fusion state is essential for subunit vaccine design \cite{Li_2020furin,Gonzalez2023,Shi2023postfusion}.

\end{enumerate}

In light of the technical limitations of \underline{GIBAC}, it is also to be used in combination with other parameters (e.g., LogD \cite{Wang2023LogD} or synthesizability \cite{Ertl2009}, et cetera) in drug discovery \& design in future.
Of note, GIBAC in itself is no more than a tool, and biology and biophysics are the science underlying all processes in living organisms.
As a result, in case GIBAC is actually to be used in future as a chatbot for drug discovery \& design, its user may want to follow a \underline{science first, technology second} principle, and first try to understand the physiology and the pathophysiology for target(s)-specific drug discovery \& design.



\section*{Conclusion and discussion}
To sum up, this article puts forward a conceptual and practical framework (Figure~\ref{fig:88833}) of \underline{a truly general intermolecular binding affinity calculator}, including its inception, definition, construction, practical applications, technical challenges and limitations, and future directions (GIBAC \cite{Li2022GIBAC}).


\begin{figure*}[hbp!]
\centering
\includegraphics[width=0.9\textwidth]{framework.png}
\caption{GIBAC as a K\textsubscript{d}-based search engine for drug discovery \& design.}
\label{fig:88833}
\end{figure*}




As is known throughout the industry, drug R\&D is a multi-purpose, expensive and arduous task \cite{Lipinski_2001,Zheng2013,Zheng2018}.
It is also a highly regulated process that can take years and many millions to billions of dollars just to get rejected \cite{Wong2018MIT}.
As of \today, the average time to develop a drug is between 10 and 15 years, which includes discovery, lead identification and optimization, preclinical testing, clinical trials, regulatory review, and final regulatory approval.

As the beginning part of the drug R\&D, drug discovery \& design itself is still a lengthy, costly, difficult, and inefficient yet pivotal process \cite{Zhou2017DDD}.
Given this, only a GIBAC with adequate accuracy, precision, and efficiency is able to create a paradigm shift \cite{Subramaniam2022} from structure-based molecular generation to a search engine for drug discovery \& design and find its practical application in the pharmaceutical industry \cite{Gupta_2021, Bonvin_2021}, aiming at increasing the probability of successfully discovering new drugs, while reducing discovery costs and timelines.

Finally, this article argues that the time is now ripe for the construction of such a GIBAC to be listed on the agenda of the drug R\&D community, including in particular structural biologists and biophysicists, medicinal and computational chemists, drug discoverers \& designers, and algorithm designers, in light of





\begin{enumerate}
\item the crucial roles of K\textsubscript{d} and \textDelta G in drug discovery \& design in continued optimization of drug-target interactions to facilitate the development of therapeutic agents with improved efficacy and safety \cite{Zeng_2021DTBA,Pushpakom_2018,Han_2022DDInteractions}.

\item the recent progress of AI algorithms in drug discovery \& design \cite{Zhang2023,Yang_2020}.

\item a large amount of data \cite{LUTOMSKI20222617,Jumper_2021}, and a variety of tools \cite{Tong2021,pelton1993tautomeric,pronk2013gromacs,SchneidmanDuhovny2005} in structural biology, bioinformatics, biophysics, drug discovery \& design, et cetera.

\item the democratization of high performance computing (HPC) since the beginning of this century and its continued evolution towards scalable quantum computing \cite{Kendon2020quantum} and perhaps computation beyond silicon \cite{IlanGur} in future.

\item the framework (Figure~\ref{fig:88833}) for GIBAC \cite{Li2022GIBAC} as described here.
\end{enumerate}



\section*{Acknowledgment}
The author is grateful to the communities of structural biology, biophysics, medicinal and computational chemistry and algorithm design, for the continued accumulation of knowledge and data for drug discovery \& design, and for the continued development of tools (hardware, software and algorithms) for drug R\&D, which contributed immensely to the inception of GIBAC, and which are also to contribute to the construction of an accurate, precise and efficient GIBAC for drug discovery \& design.






\section*{Ethical statement}
No ethical approval is required.


\section*{Conflict of interest}
None.


\section*{Declaration of generative AI and AI-assisted technologies in the writing process}
During the preparation of this work, the author used OpenAI's ChatGPT in order to improve the readability of the manuscript, and to make it as concise and short as possible. 
After using this tool, the author reviewed and edited the content as needed and takes full responsibility for the content of the publication.



\clearpage

\authorcontributions{Conceptualization, W.L.; methodology, W.L.; software, W.L.; validation, W.L.; formal analysis, W.L.; investigation, W.L.; resources, W.L.; data curation, W.L.; writing--original draft preparation, W.L.; writing--review and editing, W.L.; visualization, W.L.; supervision, W.L.; project administration, W.L.; funding acquisition, not applicable.}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\funding{This research received no external funding.}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\conflictsofinterest{The author declares no conflict of interest.} 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% optional

%=====================================
% References, variant B: external bibliography
%=====================================
\externalbibliography{yes}
\bibliography{mybibfile}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% optional
%\sample availability{Samples of the compounds ...... are available from the authors.}

%% for journal Sci
%\review reports{\\
%Reviewer 1 comments and authors’ response\\
%Reviewer 2 comments and authors’ response\\
%Reviewer 3 comments and authors’ response
%}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}
