Commit 42adbcae by Fakher F. Assaad

Updated Doc to Doc ALF-1.0

1 parent 78c00b35
No preview for this file type
\begin{thebibliography}{10}
\bibitem{Blankenbecler81}
R. Blankenbecler, D.~J. Scalapino, and R.~L. Sugar, Phys. Rev. D {\bf 24},
2278 (1981).
\bibitem{White89}
S. White, D. Scalapino, R. Sugar, E. Loh, J. Gubernatis, and R. Scalettar,
Phys. Rev. B {\bf 40}, 506 (1989).
\bibitem{Sugiyama86}
G. Sugiyama and S. Koonin, Annals of Physics {\bf 168}, 1 (1986).
\bibitem{Sorella89}
S. Sorella, S. Baroni, R. Car, and M. Parrinello, EPL (Europhysics Letters)
{\bf 8}, 663 (1989).
\bibitem{Duane87}
S. Duane, A.~D. Kennedy, B.~J. Pendleton, and D. Roweth, Phys. Lett. {\bf
B195}, 216 (1987).
\bibitem{Assaad08_rev}
F. Assaad and H. Evertz, in {\em Computational Many-Particle Physics},
Vol.~739 of {\em Lecture Notes in Physics}, edited by H. Fehske, R.
Schneider, and A. Wei{\ss}e (Springer, Berlin Heidelberg, 2008), pp.\
277--356.
\bibitem{Scalapino07}
D.~J. Scalapino, in {\em Handbook of High-Temperature Superconductivity:
Theory and Experiment}, edited by J.~R. Schrieffer and J.~S. Brooks (Springer
New York, New York, NY, 2007), pp.\ 495--526.
\bibitem{LeBlanc15}
J.~P.~F. LeBlanc, A.~E. Antipov, F. Becca, I.~W. Bulik, G.~K.-L. Chan, C.-M.
Chung, Y. Deng, M. Ferrero, T.~M. Henderson, C.~A. Jim\'enez-Hoyos, E. Kozik,
X.-W. Liu, A.~J. Millis, N.~V. Prokof'ev, M. Qin, G.~E. Scuseria, H. Shi,
B.~V. Svistunov, L.~F. Tocchio, I.~S. Tupitsyn, S.~R. White, S. Zhang, B.-X.
Zheng, Z. Zhu, and E. Gull, Phys. Rev. X {\bf 5}, 041041 (2015).
\bibitem{Hohenadler10}
M. Hohenadler, T.~C. Lang, and F.~F. Assaad, Phys. Rev. Lett. {\bf 106},
100403 (2011).
\bibitem{Zheng11}
D. Zheng, G.-M. Zhang, and C. Wu, Phys. Rev. B {\bf 84}, 205121 (2011).
\bibitem{Assaad13}
F.~F. Assaad and I.~F. Herbut, Phys. Rev. X {\bf 3}, 031010 (2013).
\bibitem{Toldin14}
F. Parisen~Toldin, M. Hohenadler, F.~F. Assaad, and I.~F. Herbut, Phys. Rev. B
{\bf 91}, 165108 (2015).
\bibitem{Otsuka16}
Y. Otsuka, S. Yunoki, and S. Sorella, Phys. Rev. X {\bf 6}, 011029 (2016).
\bibitem{Chandrasekharan13}
S. Chandrasekharan and A. Li, Phys. Rev. D {\bf 88}, 021701 (2013).
\bibitem{Chandrasekharan15}
V. Ayyar and S. Chandrasekharan, Phys. Rev. D {\bf 91}, 065035 (2015).
\bibitem{Li15a}
Z.-X. {Li}, Y.-F. {Jiang}, S.-K. {Jian}, and H. {Yao}, ArXiv:1512.07908
(2015).
\bibitem{Assaad16}
F.~F. Assaad and T. Grover, Phys. Rev. X {\bf 6}, 041049 (2016).
\bibitem{Assaad99a}
F.~F. Assaad, Phys. Rev. Lett. {\bf 83}, 796 (1999).
\bibitem{Capponi00}
S. Capponi and F.~F. Assaad, Phys. Rev. B {\bf 63}, 155114 (2001).
\bibitem{Schattner15}
Y. Schattner, S. Lederer, S.~A. Kivelson, and E. Berg, Phys. Rev. X {\bf 6},
031028 (2016).
\bibitem{Xu16b}
X.~Y. {Xu}, K. {Sun}, Y. {Schattner}, E. {Berg}, and Z.~Y. {Meng}, ArXiv
e-prints (2016).
\bibitem{Berg12}
E. Berg, M.~A. Metlitski, and S. Sachdev, Science {\bf 338}, 1606 (2012).
\bibitem{Tang14_1}
H.-K. Tang, X. Yang, J. Sun, and H.-Q. Lin, Europhys. Lett. {\bf 107}, 40003
(2014).
\bibitem{Assaad04}
F.~F. Assaad, Phys. Rev. B {\bf 71}, 075103 (2005).
\bibitem{Lang13}
T.~C. Lang, Z.~Y. Meng, A. Muramatsu, S. Wessel, and F.~F. Assaad, Phys. Rev.
Lett. {\bf 111}, 066401 (2013).
\bibitem{Hohenadler14}
M. Hohenadler, F. Parisen~Toldin, I.~F. Herbut, and F.~F. Assaad, Phys. Rev. B
{\bf 90}, 085146 (2014).
\bibitem{Tang15}
H.-K. Tang, E. Laksono, J.~N.~B. Rodrigues, P. Sengupta, F.~F. Assaad, and S.
Adam, Phys. Rev. Lett. {\bf 115}, 186602 (2015).
\bibitem{Rigol03}
M. Rigol, A. Muramatsu, G.~G. Batrouni, and R.~T. Scalettar, Phys. Rev. Lett.
{\bf 91}, 130403 (2003).
\bibitem{Lee09}
D. Lee, Progress in Particle and Nuclear Physics {\bf 63}, 117 (2009).
\bibitem{Grover13}
T. Grover, Phys. Rev. Lett. {\bf 111}, 130402 (2013).
\bibitem{Broecker14}
P. Broecker and S. Trebst, Journal of Statistical Mechanics: Theory and
Experiment {\bf 2014}, P08015 (2014).
\bibitem{Assaad13a}
F.~F. Assaad, T.~C. Lang, and F. Parisen~Toldin, Phys. Rev. B {\bf 89}, 125121
(2014).
\bibitem{Assaad15}
F.~F. Assaad, Phys. Rev. B {\bf 91}, 125146 (2015).
\bibitem{Wu04}
C. Wu and S.-C. Zhang, Phys. Rev. B {\bf 71}, 155115 (2005).
\bibitem{Huffman14}
E.~F. Huffman and S. Chandrasekharan, Phys. Rev. B {\bf 89}, 111101 (2014).
\bibitem{Yao14a}
Z.-X. Li, Y.-F. Jiang, and H. Yao, Phys. Rev. B {\bf 91}, 241117 (2015).
\bibitem{Wei16}
Z.~C. Wei, C. Wu, Y. Li, S. Zhang, and T. Xiang, Phys. Rev. Lett. {\bf 116},
250601 (2016).
\bibitem{Hubbard59}
J. Hubbard, Phys. Rev. Lett. {\bf 3}, 77 (1959).
\bibitem{Troyer05}
M. Troyer and U.-J. Wiese, Phys. Rev. Lett. {\bf 94}, 170201 (2005).
\bibitem{Duane85}
S. Duane and J.~B. Kogut, Phys. Rev. Lett. {\bf 55}, 2774 (1985).
\bibitem{Hirsch83}
J. Hirsch, Phys. Rev. B {\bf 28}, 4059 (1983).
\bibitem{Sokal89}
A.~D. Sokal, {M}onte {C}arlo Methods in Statistical Mechanics: Foundations and
New Algorithms, 1989, lecture notes from Cours de Troisi\`eme Cycle de la
Physique en Suisse Romande. Updated in 1996 for the Carg\`ese Summer School
on ``Functional Integration: Basics and Applications''.
\bibitem{Evertz93}
H.~G. Evertz, G. Lana, and M. Marcu, Phys. Rev. Lett. {\bf 70}, 875 (1993).
\bibitem{Sandvik99b}
A.~W. Sandvik, Phys. Rev. B {\bf 59}, R14157 (1999).
\bibitem{Sandvik02}
O. Sylju\aa{}sen and A. Sandvik, Phys. Rev. E {\bf 66}, 046701 (2002).
\bibitem{HirschFye86}
J.~E. Hirsch and R.~M. Fye, Phys. Rev. Lett. {\bf 56}, 2521 (1986).
\bibitem{Gull_rev}
E. Gull, A.~J. Millis, A.~I. Lichtenstein, A.~N. Rubtsov, M. Troyer, and P.
Werner, Rev. Mod. Phys. {\bf 83}, 349 (2011).
\bibitem{Assaad14_rev}
F.~F. Assaad, in {\em DMFT at 25: Infinite Dimensions: Lecture Notes of the
Autumn School on Correlated Electrons}, edited by E. Pavarini, E. Koch, D.
Vollhardt, and A. Lichtenstein (Verlag des Forschungszentrum J{\"u}lich,
J{\"u}lich, 2014), Vol.~4, Chap.~7. Continuous-time QMC Solvers for
Electronic Systems in Fermionic and Bosonic Baths, ISBN 978-3-89336-953-9.
\bibitem{Assaad07}
F.~F. Assaad and T.~C. Lang, Phys. Rev. B {\bf 76}, 035116 (2007).
\bibitem{Scalettar86}
R.~T. Scalettar, D.~J. Scalapino, and R.~L. Sugar, Phys. Rev. B {\bf 34}, 7911
(1986).
\bibitem{Durr08}
S. D{\"u}rr, Z. Fodor, J. Frison, C. Hoelbling, R. Hoffmann, S.~D. Katz, S.
Krieg, T. Kurth, L. Lellouch, T. Lippert, K.~K. Szabo, and G. Vulvert,
Science {\bf 322}, 1224 (2008).
\bibitem{Assaad02}
F.~F. Assaad, in {\em Lecture notes of the Winter School on Quantum
Simulations of Complex Many-Body Systems: From Theory to Algorithms.}, edited
by J. Grotendorst, D. Marx, and A. Muramatsu. (Publication Series of the John
von Neumann Institute for Computing, J\"ulich, 2002), Vol.~10, pp.\ 99--155.
\bibitem{Motome97}
Y. Motome and M. Imada, Journal of the Physical Society of Japan {\bf 66},
1872 (1997).
\bibitem{Assaad97}
F.~F. Assaad, M. Imada, and D.~J. Scalapino, Phys. Rev. B {\bf 56}, 15001
(1997).
\bibitem{Fye86}
R.~M. Fye, Phys. Rev. B {\bf 33}, 6271 (1986).
\bibitem{Iazzi15}
M. Iazzi and M. Troyer, Phys. Rev. B {\bf 91}, 241118 (2015).
\bibitem{Rombouts99}
S.~M.~A. Rombouts, K. Heyde, and N. Jachowicz, Phys. Rev. Lett. {\bf 82}, 4155
(1999).
\bibitem{Gull08}
E. Gull, P. Werner, O. Parcollet, and M. Troyer, EPL (Europhysics Letters) {\bf
82}, 57003 (2008).
\bibitem{Rombouts98}
S. Rombouts, K. Heyde, and N. Jachowicz, Physics Letters A {\bf 242}, 271
(1998).
\bibitem{Rost12}
D. Rost, E.~V. Gorelik, F. Assaad, and N. Bl\"umer, Phys. Rev. B {\bf 86},
155109 (2012).
\bibitem{Rost13}
D. Rost, F. Assaad, and N. Bl\"umer, Phys. Rev. E {\bf 87}, 053305 (2013).
\bibitem{Bluemer08}
N. {Bl{\"u}mer}, ArXiv e-prints (2008).
\bibitem{Bai2011}
Z. Bai, C. Lee, R.-C. Li, and S. Xu, Linear Algebra and its Applications {\bf
435}, 659 (2011).
\bibitem{vanderSluis1969}
A. van~der Sluis, Numerische Mathematik {\bf 14}, 14 (1969).
\bibitem{Bercx17}
M. Bercx, J.~S. Hofmann, F.~F. Assaad, and T.~C. Lang, Phys. Rev. B {\bf 95},
035108 (2017).
\bibitem{Milat04}
I. Milat, F. Assaad, and M. Sigrist, Eur. Phys. J. B {\bf 38}, 571 (2004),
http://xxx.lanl.gov/cond-mat/0312450.
\bibitem{Bercx09}
M. Bercx, T.~C. Lang, and F.~F. Assaad, Phys. Rev. B {\bf 80}, 045412 (2009).
\bibitem{Xu16}
X.~Y. {Xu}, K.~S.~D. {Beach}, K. {Sun}, F.~F. {Assaad}, and Z.~Y. {Meng},
ArXiv:1602.07150 (2016).
\bibitem{Beach04}
K.~S.~D. Beach, P.~A. Lee, and P. Monthoux, Phys. Rev. Lett. {\bf 92}, 026401
(2004).
\bibitem{Li15}
Z.-X. Li, Y.-F. Jiang, and H. Yao, New Journal of Physics {\bf 17}, 085003
(2015).
\bibitem{Assaad14}
F.~F. Assaad, Nat. Phys. {\bf 10}, 905 (2014).
\bibitem{Xu16a}
X.~Y. Xu, Y. Qi, J. Liu, L. Fu, and Z.~Y. Meng, arXiv:1612.03804 (2016).
\bibitem{Ulybyshev2013}
M.~V. Ulybyshev, P.~V. Buividovich, M.~I. Katsnelson, and M.~I. Polikarpov,
Phys. Rev. Lett. {\bf 111}, 056801 (2013).
\bibitem{Brower12}
R. Brower, C. Rebbi, and D. Schaich, PoS(Lattice 2011)056 (arXiv:1204.5424).
\bibitem{Jureca16}
{J\"ulich Supercomputing Centre}, Journal of large-scale research facilities
{\bf 2}, A62 (2016).
\end{thebibliography}
No preview for this file type
......@@ -4,6 +4,7 @@
% under a Creative Commons Attribution-ShareAlike 4.0 International License.
% For the licensing details of the documentation see license.CCBYSA.
\documentclass[10pt,Arial]{scrartcl}
\usepackage{graphicx}
\usepackage[margin=2.5cm]{geometry}
......@@ -27,8 +28,19 @@
\usepackage{url}
\usepackage{booktabs}
\usepackage{hyperref}
\lstset{language=Fortran,
\usepackage{float}
\makeatletter
\renewcommand\paragraph{\@startsection{paragraph}{4}{\z@}%
{-2.5ex\@plus -1ex \@minus -.25ex}%
{1.25ex \@plus .25ex}%
{\normalfont\normalsize\bfseries}}
\makeatother
\setcounter{secnumdepth}{4}
\setcounter{tocdepth}{4}
\lstdefinestyle{fortran}{
language=Fortran,
basicstyle=\ttfamily,
keywordstyle=\color{red},
commentstyle=\color{blue},
......@@ -39,8 +51,8 @@
columns=flexible
}
\lstdefinestyle{custombash}{
language=bash,
\lstdefinestyle{bash}{
language=bash,
basicstyle=\ttfamily,
keywordstyle=\color{red},
commentstyle=\color{blue},
......@@ -55,22 +67,27 @@ language=bash,
\def\Trf{\mathop{\mathrm{Tr}_{\mathrm{F}}}}
\makesavenoteenv{tabular}
\makesavenoteenv{table}
% % only for the scrartcl class:
\setkomafont{author}{\large}
\setkomafont{date}{\large}
% \RedeclareSectionCommand[style=section,indent=0pt]{part}
% \renewcommand*\partformat{\thepart\autodot\enskip}
\newcommand{\mycomment}[1]{{\color{blue} #1}}
\newcommand{\mycomment}[1]{{\color{red} #1}}
\newcommand{\FAcomment}[1]{{\color{red} #1}}
\makeindex
\begin{document}
%---------------------------------------------------------------------------------------------------------
\title{The \textit{ALF} (\textit{A}lgorithms for \textit{L}attice \textit{F}ermions) project release 0.5}
\title{The \emph{ALF} (\emph{A}lgorithms for \emph{L}attice \emph{F}ermions) project release 1.0}
\subtitle{Documentation for the auxiliary field quantum Monte Carlo code.}
\author{Martin Bercx, Florian Goth, Johannes S. Hofmann, Fakher F. Assaad }
%---------------------------------------------------------------------------------------------------------
\maketitle
Copyright \textcopyright ~2016, The \textit{ALF} Project.\\
Copyright \textcopyright ~2016, 2017 The \textit{ALF} Project.\\
This is the ALF Project Documentation by the ALF contributors.
It is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
You are free to share and benefit from this documentation as long as this license is preserved
......@@ -78,124 +95,30 @@ and proper attribution to the authors is given. For details see the ALF project
homepage \url{alf.physik.uni-wuerzburg.de}.
\tableofcontents
\clearpage
\section{Introduction}\label{sec:intro}
\input{intro}
\section{Auxiliary Field Quantum Monte Carlo}\label{sec:def}
\input{model}
\input{updating}
\input{stabilization}
\input{sampling}
\section{Data Structures and Input/Output}\label{sec:imp}
\input{implementation}
\input{files}
\input{walkthrough_hubbard}
\input{other_models}
\input{analysis}
\input{running}
\section{Examples}\label{sec:ex}
\input{walkthrough_hubbard}
\section{Miscellaneous}\label{sec:misc}
\input{other_models}
\input{performance}
\section{Conclusions and Future Directions}\label{sec:con}
\input{conclusion}
\addcontentsline{toc}{section}{Acknowledgments}
\input{acknowledgment}
\addcontentsline{toc}{section}{References}
%\bibliographystyle{./prXsty}
%\bibliography{./fassaad}
\begin{thebibliography}{10}
\bibitem{Blankenbecler81}
R. Blankenbecler, D.~J. Scalapino, and R.~L. Sugar, Phys. Rev. D {\bf 24},
2278 (1981).
\bibitem{Assaad08_rev}
F. Assaad and H. Evertz, in {\em Computational Many-Particle Physics},
Vol.~739 of {\em Lecture Notes in Physics}, edited by H. Fehske, R.
Schneider, and A. Wei{\ss}e (Springer, Berlin Heidelberg, 2008), pp.\
277--356.
\bibitem{Wu04}
C. Wu and S.-C. Zhang, Phys. Rev. B {\bf 71}, 155115 (2005).
\bibitem{Wei16}
Z.~C. Wei, C. Wu, Y. Li, S. Zhang, and T. Xiang, Phys. Rev. Lett. {\bf 116},
250601 (2016).
\bibitem{White89}
S. White, D. Scalapino, R. Sugar, E. Loh, J. Gubernatis, and R. Scalettar,
Phys. Rev. B {\bf 40}, 506 (1989).
\bibitem{Sugiyama86}
G. Sugiyama and S. Koonin, Annals of Physics {\bf 168}, 1 (1986).
\bibitem{Sorella89}
S. Sorella, S. Baroni, R. Car, and M. Parrinello, EPL (Europhysics Letters)
{\bf 8}, 663 (1989).
\bibitem{Milat04}
I. Milat, F. Assaad, and M. Sigrist, Eur. Phys. J. B {\bf 38}, 571 (2004),
http://xxx.lanl.gov/cond-mat/0312450.
\bibitem{Bercx09}
M. Bercx, T.~C. Lang, and F.~F. Assaad, Phys. Rev. B {\bf 80}, 045412 (2009).
\bibitem{Schattner15}
Y. Schattner, S. Lederer, S.~A. Kivelson, and E. Berg, Phys. Rev. X {\bf 6},
031028 (2016).
\bibitem{Xu16}
X.~Y. {Xu}, K.~S.~D. {Beach}, K. {Sun}, F.~F. {Assaad}, and Z.~Y. {Meng},
ArXiv:1602.07150 (2016).
\bibitem{Assaad16}
F.~F. Assaad and T. Grover, Phys. Rev. X {\bf 6}, 041049 (2016).
\bibitem{Assaad99a}
F.~F. Assaad, Phys. Rev. Lett. {\bf 83}, 796 (1999).
\bibitem{Capponi00}
S. Capponi and F.~F. Assaad, Phys. Rev. B {\bf 63}, 155114 (2001).
\bibitem{Beach04}
K.~S.~D. Beach, P.~A. Lee, and P. Monthoux, Phys. Rev. Lett. {\bf 92}, 026401
(2004).
\bibitem{Assaad04}
F.~F. Assaad, Phys. Rev. B {\bf 71}, 075103 (2005).
\bibitem{Lang13}
T.~C. Lang, Z.~Y. Meng, A. Muramatsu, S. Wessel, and F.~F. Assaad, Phys. Rev.
Lett. {\bf 111}, 066401 (2013).
\bibitem{Li15}
Z.-X. Li, Y.-F. Jiang, and H. Yao, New Journal of Physics {\bf 17}, 085003
(2015).
\bibitem{Broecker14}
P. Broecker and S. Trebst, Journal of Statistical Mechanics: Theory and
Experiment {\bf 2014}, P08015 (2014).
\bibitem{Assaad14}
F.~F. Assaad, Nat Phys {\bf 10}, 905 (2014).
\bibitem{Assaad13a}
F.~F. Assaad, T.~C. Lang, and F. Parisen~Toldin, Phys. Rev. B {\bf 89}, 125121
(2014).
\bibitem{Assaad15}
F.~F. Assaad, Phys. Rev. B {\bf 91}, 125146 (2015).
\bibitem{Xu16a}
X.~Y. Xu, Y. Qi, J. Liu, L. Fu, and Z.~Y. Meng, arXiv:1612.03804 (2016).
\bibitem{Hohenadler14}
M. Hohenadler, F. Parisen~Toldin, I.~F. Herbut, and F.~F. Assaad, Phys. Rev. B
{\bf 90}, 085146 (2014).
\bibitem{Ulybyshev2013}
M.~V. Ulybyshev, P.~V. Buividovich, M.~I. Katsnelson, and M.~I. Polikarpov,
Phys. Rev. Lett. {\bf 111}, 056801 (2013).
\bibitem{Brower12}
R. Brower, C. Rebbi, and D. Schaich, PoS(Lattice 2011)056 (arXiv:1204.5424) .
\bibitem{Jureca16}
{J\"ulich Supercomputing Centre}, Journal of large-scale research facilities
{\bf 2}, A62 (2016).
\end{thebibliography}
\bibliographystyle{./prXsty}
\bibliography{./fassaad}
\addcontentsline{toc}{section}{License}
\input{license}
......
#Node real time speedup
1 156.789 28
2 79.2239 55.4137324721454
4 39.7976 110.31047098317485
8 19.93456 220.22517677841896
16 9.9785 439.95510347246574
32 5.028 873.1288782816229
64 2.506 1751.832402234637
No preview for this file type
No preview for this file type
#OMP real-time speedup (pinned)
1 41487.85 1
2 24350.843 1.703754157504937
4 14902.294 2.7839908406048086
7 8567.924 4.842228992694146
14 4244.074 9.775477524661445
28 3827.053 10.84067819285492
No preview for this file type
No preview for this file type
#L Realtime per bin DOF
4 46.5952 256
5 157.7983 400
6 517.6544 576
7 1350.915 784
8 3214.6523 1024
9 5929.47 1296
No preview for this file type
No preview for this file type
......@@ -3,12 +3,13 @@
% The ALF project documentation by the ALF contributors is licensed
% under a Creative Commons Attribution-ShareAlike 4.0 International License.
% For the licensing details of the documentation see license.CCBYSA.
% !TEX root = ALF-05.tex
% !TEX root = Doc.tex
%-------------------------------------------------------------------------------------
\section*{Acknowledgments}
%-------------------------------------------------------------------------------------
We are very grateful to S. Beyl, M. Hohenadler, F. Parisen Toldin, M. Raczkowski, J. Schwab, T. Sato, Z. Wang and M. Weber, for constant support during the development of this project. FFA would also like to thank T. Lang and Z.Y. Meng for developments of the auxiliary field code as well as T. Grover.
MB thanks the the Bavarian Competence Network for Technical and Scientific High Performance Computing (KONWIHR) for financial support. FG and JH thank the SFB-1170 for financial support under projects Z03 and C01. FFA thanks the DFG-funded FOR1807 and FOR1346 for partial financial support.
Part of the optimization of the code was carried during the Porting and Tuning Workshop 2016 offered by the Forschungszentrum J\"ulich.
Calculations to extensively test this package were carried out on SuperMUC at the Leibniz Supercomputing Centre and on JURECA \cite{Jureca16} at the J\"ulich Supercomputing Centre (JSC). We thank those institutions for generous computer allocations.
We are very grateful to S. Beyl, M. Hohenadler, F. Parisen Toldin, M. Raczkowski, J. Schwab, T. Sato, Z. Wang and M. Weber, for constant support during the development of this project. FFA would also like to thank T.~Lang and Z.~Y.~Meng for developments of the auxiliary field code as well as T.~Grover.
MB thanks the Bavarian Competence Network for Technical and Scientific High Performance Computing (KONWIHR) for financial support. FG and JH thank the SFB-1170 for financial support under projects Z03 and C01. FFA thanks the DFG-funded FOR1807 and FOR1346 for partial financial support.
Part of the optimization of the code was carried out during the Porting and Tuning Workshop 2016 offered by the Forschungszentrum J\"ulich.
Calculations to extensively test this package were carried out both on SuperMUC at the Leibniz Supercomputing Centre and on JURECA \cite{Jureca16} at the J\"ulich Supercomputing Centre. We thank both institutions for generous allocation of computing time.
%The authors gratefully acknowledge the computing time granted by the John von Neumann Institute for Computing (NIC) and provided on the supercomputer JURECA \cite{Jureca16} at Jülich Supercomputing Centre (JSC). The authors gratefully acknowledge the Gauss Centre for Supercomputing e.V. (www.gauss-centre.eu) for funding this project by providing computing time on the GCS Supercomputer SuperMUC at the Leibniz Supercomputing Centre (LRZ, www.lrz.de).
\ No newline at end of file
......@@ -6,36 +6,38 @@
% !TEX root = Doc.tex
%-------------------------------------------------------------------------------------
\section{ Analysis programs }\label{sec:analysis}
\subsection{ Analysis programs }\label{sec:analysis}
%-------------------------------------------------------------------------------------
%
\begin{table}[h]
\begin{tabular}{@{} l l @{}}\toprule
Program & Description \\\midrule
\texttt{cov\_scal.f90} & In combination with the script \texttt{analysis.sh}, the bin files with suffix \texttt{\_scal} are read in, \\
& and corresponding file with suffix \texttt{\_scalJ} are produced. They contain the result of the \\
& Jackknife resampling. \\
& and the corresponding files with suffix \texttt{\_scalJ} are produced. They contain the result \\
& of the Jackknife rebinning analysis (see Sec.~\ref{sec:sampling}). \\
\texttt{cov\_eq.f90} & In combination with the script \texttt{analysis.sh}, the bin files with suffix \texttt{\_eq} are read in, \\
& and corresponding files will suffix \texttt{\_eqJR} and \texttt{\_eqJK} are produced. They correspond to\\
& correlation functions in real and Fourier space, respectively. \\
& and the corresponding files with suffix \texttt{\_eqJR} and \texttt{\_eqJK} are produced. They correspond \\
& to correlation functions in real and Fourier space, respectively. \\
\texttt{cov\_tau.f90} & In combination with the script \texttt{analysis.sh}, the bin files \texttt{X\_tau} are read in, \\
& and the directories \texttt{X\_kx\_ky} are produced for all \texttt{kx} and \texttt{ky} greater than or equal to zero. \\
& Here \texttt{X} is a placeholder for \texttt{Green}, \texttt{SpinXY}, etc.\ as specified in \texttt{Alloc\_obs(Ltau)} \\
& (see Sec.~\ref{Alloc_obs_sec}). Each directory contains a file \texttt{g\_kx\_ky} containing the \\
& time displaced correlation function traced over the orbitals. It also contains the \\
& covariance matrix if \texttt{N\_cov} is set to unity in the parameter file listed in Sec.~\ref{sec:input}. \\
& covariance matrix if \texttt{N\_cov} is set to unity in the parameter file (see Sec.~\ref{sec:input}). \\
& Equally, a directory \texttt{X\_R0} for the local time displaced correlation function is generated. \\\bottomrule
\end{tabular}
\caption{ Overview of analysis programs that are called within the script \texttt{analysis.sh}. \label{table:analysis_programs}}
\end{table}
%
Here we briefly discuss the analysis programs which read in bins and carry out the error analysis.
Error analysis is based on the central limit theorem, which required bins to be statistically independent, and also the existence of a well-defined variance of the distribution.
The former will be the case if bins are longer than the auto-correlation time. The latter has to be checked by the user, since in general the distribution variance depends on the model and on the observable.
In the parameter file listed in Sec.~\ref{sec:input}, the user can specify the how many initial bins should be omitted (variable \texttt{n\_skip}).
This number should be comparable to the auto-correlation time.
The re-binning variable \texttt{N\_rebin} will merge \texttt{N\_rebin} bins into a single one. If the autocorrelation time is smaller than the effective bin size, then the error should be independent on the bin size and thereby on the variable \texttt{N\_rebin}. Our analysis is based on the Jackknife resampling. As listed in Table, \ref{table:analysis_programs} we provide three programs to account for the three observable types. The programs can be found in the directory \texttt{Analysis} and are executed by running the bash shell script
\texttt{analysis.sh}
Here we briefly discuss the analysis programs which read in bins and carry out the error analysis. (See Sec.~\ref{sec:sampling} for a more detailed discussion.)
Error analysis is based on the central limit theorem, which requires bins to be statistically independent, and also the existence of a well-defined variance for the observable under consideration.
The former will be the case if bins are longer than the autocorrelation time. The latter has to be checked by the user. In the parameter file listed in Sec.~\ref{sec:input}, the user can specify how many initial bins should be omitted (variable \texttt{n\_skip}).
This number should be comparable to the autocorrelation time.
The rebinning variable \texttt{N\_rebin} will merge \texttt{N\_rebin} bins into a single new bin.
If the autocorrelation time is smaller than the effective bin size, the error should become independent of the bin size and thereby of the variable \texttt{N\_rebin}.
Our analysis is based on the Jackknife resampling.
As listed in Table~\ref{table:analysis_programs}, we provide three analysis programs to account for the three observable types. The programs can be found in the directory \texttt{Analysis} and are executed by running the bash shell script
\texttt{analysis.sh}.
%
\begin{table}[h]
\begin{tabular}{@{} l l @{}}\toprule
......@@ -55,7 +57,7 @@ The re-binning variable \texttt{N\_rebin} will merge \texttt{N\_rebin} bins i
& The suffixes \texttt{R} and \texttt{K} refer to real and reciprocal space, respectively.\\
\texttt{Y\_R0/g\_R0} & Time-resolved and spatially local Jackknife mean and error of \texttt{Y},\\
& where \texttt{Y} stands for \texttt{Green, SpinZ, SpinXY}, and \texttt{Den}.\\
\texttt{Y\_kx\_ky/g\_kx\_ky} & Time-resolved and $\vec{k}$-dependent Jackknife mean and error of \texttt{Y},\\
\texttt{Y\_kx\_ky/g\_kx\_ky} & Time-resolved and $\vec{k}$-dependent Jackknife mean and error of \texttt{Y},\\
& where \texttt{Y} stands for \texttt{Green, SpinZ, SpinXY}, and \texttt{Den}.\\\bottomrule
\end{tabular}
\caption{ Standard output files of the error analysis. \label{table:analysis_output}}
......@@ -72,7 +74,7 @@ OBS : 1 <mean(X)> <error(X)>
OBS : 2 <mean(sign)> <error(sign)>
\end{alltt}
\item For the equal-time correlation functions \texttt{Y}, the formatting of the output files \texttt{Y\_eqJR} and \texttt{Y\_eqJK} follows this structure:
\item For the equal-time correlation functions \texttt{Y}, the formatting of the output files \texttt{Y\_eqJR} and \texttt{Y\_eqJK} follows this structure:
\begin{alltt}
do i = 1, N_unit_cell
<k_x(i)> <k_y(i)>
......@@ -85,7 +87,7 @@ enddo
\end{alltt}
where \texttt{Re} and \texttt{Im} refer to the real and imaginary part, respectively.
\item The time-displaced correlation functions \texttt{Y} are written to the output files \texttt{Y\_R0/g\_R0}, when measured locally in space,
\item The imaginary-time displaced correlation functions \texttt{Y} are written to the output files \texttt{Y\_R0/g\_R0}, when measured locally in space,
and to the output files \texttt{Y\_kx\_ky/g\_kx\_ky} when they are measured $\vec{k}$-resolved.
Both output files have the following formatting:
\begin{alltt}
......
......@@ -4,8 +4,4 @@
% under a Creative Commons Attribution-ShareAlike 4.0 International License.
% For the licensing details of the documentation see license.CCBYSA.
%-------------------------------------------------------------------------------------
\section{Conclusions and future directions}
%-------------------------------------------------------------------------------------
In its present form, the ALF-project allows to simulate a very large class of non-trivial models efficiently and at a minimal programming cost. There are many possible extensions which deserve to be considered in future releases. The Hamiltonians we presently defining are imaginary time independent. This however, can be easily generalized to time dependent Hamiltonians thus allowing, for example, to access entanglement properties of interacting fermionic systems \cite{Broecker14,Assaad14,Assaad13a,Assaad15}. Generalizations to include global moves are equally desirable. This is a prerequisite to play with recent ideas of self-learning algorithms \cite{Xu16a} so as to possibly avoid critical slowing down. At present we are restricted to discrete fields such that implementations of the long range Coulomb repulsion as introduced in \cite{Hohenadler14,Ulybyshev2013,Brower12} is not included in the package. Extensions to continuous fields are certainly possible, but require an efficient upgrading scheme. Finally, a ground state projective formulation is equally desirable.
In its present form, the auxiliary field QMC code of the ALF project allows one to simulate a large class of non-trivial models, both efficiently and at minimal programming cost. There are many possible extensions which deserve to be considered in future releases. The model Hamiltonians we have presented so far are imaginary-time independent. This, however, can be easily generalized to imaginary-time dependent model Hamiltonians, thus allowing one, for example, to access entanglement properties of interacting fermionic systems~\cite{Broecker14,Assaad14,Assaad13a,Assaad15}. Generalizations to include global moves are equally desirable. This is a prerequisite to play with recent ideas of self-learning algorithms~\cite{Xu16a} so as to possibly avoid the issue of critical slowing down. At present, the QMC code of this package is restricted to discrete HS fields such that implementations of the long-range Coulomb repulsion -- as introduced in Refs.~\cite{Hohenadler14,Ulybyshev2013,Brower12} -- are not yet included. Extensions to continuous HS fields are certainly possible, but require an efficient upgrading scheme. Finally, an implementation of the ground-state projective QMC method is equally desirable.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
......@@ -12,7 +12,7 @@
The ALF code is provided as open source software such that it is available to all and we hope that it
will be useful. If you benefit from this code we ask that you acknowledge the ALF collaboration as mentioned on our
homepage \url{alf.physik.uni-wuerzburg.de}. The git repository at \url{alf.physik.uni-wuerzburg.de} gives us the tools to
create a small but vibrant community around the code and provides a suitable entrypoint for future contributors and future developments.
create a small but vibrant community around the code and provides a suitable entry point for future contributors and future developments.
The homepage is also the place where the original source files can be found.
With the coming public release it was necessary to add copyright headers to our source files.
%and to think about the
......@@ -25,14 +25,14 @@ license your changes under the same license. The details are in the file license
The source code itself is licensed under a GPL license to keep the source as well as any future work in the community.
To express our desire for a proper attribution we decided to make this a visible part of the license.
To that end we have exercised the rights of section 7 of GPL version 3 and have amended
the license terms with an additional paragraph that expresses our wish that if an author has benfitted from this code
the license terms with an additional paragraph that expresses our wish that if an author has benefitted from this code
he/she should consider giving back a citation as specified on \url{alf.physik.uni-wuerzburg.de}.
This is not something that is meant to restrict your freedom of use, but something that we strongly expect to be good scientific conduct.
The original GPL license can be found in the file license.GPL and the additional terms can be found in license.additional.
In favour to our users, \textit{ALF} contains part of the lapack implementation version 3.6.1 from \url{http://www.netlib.org/lapack}.
For the convenience of our users, the ALF code contains part of the LAPACK implementation version 3.6.1 from \url{http://www.netlib.org/lapack}.
LAPACK is licensed under the modified BSD license whose full text can be found in license.BSD.\\
With that being said, we hope that ALF will prove to you to be a suitable and highly performant tool that enables
you to perform Monte Carlo studies of solid state models of unprecedented complexity.\\
With that being said, we hope that the ALF code will prove to you to be a suitable and high-performance tool that enables
you to perform quantum Monte Carlo studies of solid state models of unprecedented complexity.\\
\\
The ALF project's contributors.\\
......
......@@ -6,25 +6,27 @@
% !TEX root = Doc.tex
%-------------------------------------------------------------------------------------
\section{Other models}
\subsection{Other models}
%-------------------------------------------------------------------------------------
\label{sec:other_models}
The aim of this section is to briefly mention a small selection of other models that can be simulated within the ALF-project.
The aim of this section is to briefly mention a small selection of other models that can be studied using the QMC code of the ALF project.
%-------------------------------------------------------------------------------------
\subsection{The Kondo lattice}
\subsubsection{Kondo lattice model}
%-------------------------------------------------------------------------------------
Simulating the Kondo lattice within the ALF-project requires rewriting of the model along the lines of Refs.~\cite{Assaad99a,Capponi00,Beach04}. Adopting the notation of these articles, the Hamiltonian that one will simulate reads:
\begin{equation}
Simulating the Kondo lattice with the QMC code of the ALF project requires rewriting of the model along the lines of Refs.~\cite{Assaad99a,Capponi00,Beach04}.
Adopting the notation of these articles, the Hamiltonian that one will simulate reads:
\begin{equation}\label{eqn:ham_kondo}
\hat{\mathcal{H}} =
\underbrace{-t \sum_{\langle \vec{i},\vec{j} \rangle,\sigma} \left( \hat{c}_{\vec{i},\sigma}^{\dagger} \hat{c}_{\vec{j},\sigma}^{\phantom\dagger} + \text{H.c.} \right) }_{\equiv \hat{\mathcal{H}}_t} - \frac{J}{4}
\sum_{\vec{i}} \left( \sum_{\sigma} \hat{c}_{\vec{i},\sigma}^{\dagger} \hat{f}_{\vec{i},\sigma}^{\phantom\dagger} +
\hat{f}_{\vec{i},\sigma}^{\dagger} \hat{c}_{\vec{i},\sigma}^{\phantom\dagger} \right)^{2} +
\underbrace{\frac{U}{2} \sum_{\vec{i}} \left( \hat{n}^{f}_{\vec{i}} -1 \right)^2}_{\equiv \hat{\mathcal{H}}_U}.
\end{equation}
This form is included in Eq.~\ref{eqn:general_ham_i} such that the above Hamiltonian can be implemented in our program package. The relation to the Kondo lattice model follows from expanding the square of the hybridization to obtain:
This form is included in the general Hamiltonian (\ref{eqn:general_ham}) such that the above Hamiltonian can be implemented in our program package.
The relation to the Kondo lattice model follows from expanding the square of the hybridization to obtain:
\begin{equation}
\hat{\mathcal{H}} =\hat{\mathcal{H}}_t
+ J \sum_{\vec{i}} \left( \hat{\vec{S}}^{c}_{\vec{i}} \cdot \hat{\vec{S}}^{f}_{\vec{i}} + \hat{\eta}^{z,c}_{\vec{i}} \cdot \hat{\eta}^{z,f}_{\vec{i}}
......@@ -37,18 +39,19 @@ This from is included in Eq.~\ref{eqn:general_ham_i} such the above Hamilton
\hat{P}^{-1} \hat{c}^{\phantom\dagger}_{\vec{i},\uparrow} \hat{P} = (-1)^{i_x+i_y} \hat{c}^{\dagger}_{\vec{i},\uparrow} \; \text{ and } \;
\hat{P}^{-1} \hat{c}^{\phantom\dagger}_{\vec{i},\downarrow} \hat{P} = \hat{c}^{\phantom\dagger}_{\vec{i},\downarrow}
\end{equation}
Since the $\hat{\eta}^{f} $ and $ \hat{S}^{f} $ operators do not alter the parity [$(-1)^{\hat{n}^{f}_{\vec{i}}}$ ] of the $f$-sites,
Since the $\hat{\eta}^{f} $- and $ \hat{S}^{f} $-operators do not alter the parity [$(-1)^{\hat{n}^{f}_{\vec{i}}}$ ] of the $f$-sites,
\begin{equation}
\left[ \hat{\mathcal{H}}, \hat{\mathcal{H}}_U \right] = 0.
\end{equation}
Thereby, and for positive values of $U$ , doubly occupied or empty $f$-sites corresponding to even parity will be suppressed by a Boltzmann factor
$e^{-\beta U/2} $ in comparison to odd parity ones. Choosing $\beta U $ adequately will essentially allow one to restrict the Hilbert space to odd parity $f$-sites. In this Hilbert space $\hat{\eta}^{x,f} = \hat{\eta}^{y,f} = \hat{\eta}^{z,f} =0$ such that the Hamiltonian reduces to the Kondo lattice model.
Thereby, and for positive values of $U$, doubly occupied or empty $f$-sites -- corresponding to even parity sites -- are suppressed by a Boltzmann factor
$e^{-\beta U/2} $ in comparison to odd parity sites. Choosing $\beta U $ adequately essentially allows one to restrict the Hilbert space to odd parity $f$-sites.
In this Hilbert space $\hat{\eta}^{x,f} = \hat{\eta}^{y,f} = \hat{\eta}^{z,f} =0$ such that the Hamiltonian (\ref{eqn:ham_kondo}) reduces to the Kondo lattice model.
%-------------------------------------------------------------------------------------
\subsection{SU(N) Hubbard-Heisenberg models}
\subsubsection{$SU(N)$ Hubbard-Heisenberg models}
%-------------------------------------------------------------------------------------
SU(2N) Hubbard-Heisenberg \cite{Assaad04,Lang13} models can be written as:
$SU(2N)$ Hubbard-Heisenberg \cite{Assaad04,Lang13} models can be written as:
\begin{equation}
\hat{\mathcal{H}} =
\underbrace{ - t \sum_{ \langle \vec{i},\vec{j} \rangle } \left( \vec{\hat{c}}^{\dagger}_{\vec{i}} \vec{\hat{c}}^{\phantom{\dagger}}_{\vec{j}} + \text{H.c.} \right) }_{\equiv \hat{\mathcal{H}}_t} \; \;
......@@ -64,13 +67,17 @@ $ \vec{\hat{c}}^{\dagger}_{\vec{i}} =
(\hat{c}^{\dagger}_{\vec{i},1}, \hat{c}^{\dagger}_{\vec{i},2}, \cdots, \hat{c}^{\dagger}_{\vec{i}, N } ) $ is an
$N$-flavored spinor, and $ \hat{D}_{ \vec{i},\vec{j}} = \vec{\hat{c}}^{\dagger}_{\vec{i}}
\vec{\hat{c}}_{\vec{j}} $.
To use the present package to simulate this model, one will rewrite the $J$-term as a sum of perfect squares,
To use the QMC code of the ALF project to simulate this model, one will rewrite the $J$-term as a sum of perfect squares,
\begin{equation}
\hat{\mathcal{H}}_J = -\frac{J}{4 N} \sum_{ \langle \vec{i}, \vec{j} \rangle }
\left(\hat{D}^{\dagger}_{ \langle \vec{i}, \vec{j} \rangle } + \hat{D}_{ \langle \vec{i}, \vec{j} \rangle } \right)^2 -
\left(\hat{D}^{\dagger}_{ \langle \vec{i}, \vec{j} \rangle } - \hat{D}_{ \langle \vec{i}, \vec{j} \rangle} \right)^2,
\end{equation}
so to manifestly bring it into the form of Eq.~\ref{eqn:general_ham_i}. It is amusing to note that setting the hopping $t=0$, charge fluctuations will be suppressed by the Boltzmann factor $e^{\beta U /N \left( \vec{\hat{c}}^{\dagger}_{\vec{i}} \vec{\hat{c}}^{\phantom\dagger}_{\vec{i}} - {\frac{N}{2} } \right)^2 } $ since in this case
$ \left[ \hat{\mathcal{H}}, \hat{\mathcal{H}}_U \right] = 0 $. This provides a route to use the auxiliary field QMC algorithm to simulate -- free of the sign problem -- SU(2N) Heisenberg models in the self-adjoint antisymmetric representation \footnote{ This corresponds to a Young tableau with single column and $N/2$ rows.}
For odd values of $N$ recent progress in our understanding of the origins of the sign problem \cite{Wei16} allows us to simulate -- without encountering the sign problem -- a set of non-trivial Hamiltonians \cite{Li15,Assaad16}.
so as to manifestly bring it into the form of the general Hamiltonian~(\ref{eqn:general_ham}).
It is amusing to note that setting the hopping $t=0$, charge fluctuations will be suppressed by the Boltzmann factor $e^{-\beta U /N \left( \vec{\hat{c}}^{\dagger}_{\vec{i}} \vec{\hat{c}}^{\phantom\dagger}_{\vec{i}} - {\frac{N}{2} } \right)^2 } $
%\mycomment{MB: corrected minus sign in the exponent}
since in this case $ \left[ \hat{\mathcal{H}}_J, \hat{\mathcal{H}}_U \right] = 0 $.
%\mycomment{MB: I suggest to use only the J-term here, $ \left[ \hat{\mathcal{H}}_{J}, \hat{\mathcal{H}}_U \right] = 0 $.}
This provides a route to use the auxiliary field QMC algorithm to simulate -- free of the sign problem -- $SU(2N)$ Heisenberg models in the self-adjoint antisymmetric representation.\footnote{This corresponds to a Young tableau with a single column and $N/2$ rows.}
For odd values of $N$ recent progress in our understanding of the origins of the sign problem \cite{Wei16} allows us to simulate a set of non-trivial Hamiltonians \cite{Li15,Assaad16}, without encountering the sign problem.
......@@ -3,10 +3,50 @@
% The ALF project documentation by the ALF contributors is licensed
% under a Creative Commons Attribution-ShareAlike 4.0 International License.
% For the licensing details of the documentation see license.CCBYSA.
% !TEX root = Doc.tex
%-------------------------------------------------------------------------------------
\section{Performance}
\subsection{Performance, memory requirements and parallelization}
%-------------------------------------------------------------------------------------
Almost the entire computational time is spent in BLAS routines such that the performance of the code will depend on the implementation of this library. We have found that the code performs well, and that an efficient OpenMP version can be obtained merely by loading the corresponding BLAS and LAPACK routines.
As mentioned in the introduction, the auxiliary field QMC algorithm scales linearly in inverse temperature $\beta$ and cubically in the volume $N_{\text{dim}}$. Using fast updates, a single spin flip requires $(N_{\text{dim}})^2$ operations to update the Green function upon acceptance. As there are $L_{\text{Trotter}}\times N_{\text{dim}}$ spins to be visited, the total computational cost for one sweep is of the order of $\beta (N_{\text{dim}})^3$. This operation dominates the performance, see Fig.~\ref{fig_scaling_size}. A profiling analysis of our code shows that 80--90\% of the CPU time is spent in ZGEMM calls of the BLAS library provided in the MKL package by Intel. Consequently, the single-core performance is close to optimal.
\begin{figure}[h]
\begin{center}
\includegraphics[scale=.8]{Figures/Size_scaling_ALF_2.pdf}
\end{center}
\caption{\label{fig_scaling_size}Volume scaling behavior of the auxiliary field QMC code of the ALF project on SuperMUC (phase 2/Haswell nodes) at the LRZ in Munich. The number of sites $N_{\text{dim}}$ corresponds to the system volume.
The plot confirms that the leading scaling order is due to matrix multiplications such that the runtime is dominated by calls to ZGEMM. }
\end{figure}
For the implementation which scales linearly in $\beta$, one has to store $L_{\text{Trotter}}/\texttt{NWrap}$ intermediate propagation matrices of dimension $N\times N$. For large lattices and/or low temperatures this dominates the total memory requirements that can exceed 2~GB memory for a sequential version.
At the heart of Monte Carlo schemes lies a random walk through the given configuration space. This is easily parallelized via MPI by associating one random walker to each MPI task. For each task, we start from a random configuration and have to invest the autocorrelation time $T_\mathrm{auto}$ to produce an equilibrated configuration.
Additionally, we can profit from an OpenMP parallelized version of the BLAS/LAPACK library for an additional speedup, which also affects the equilibration overhead $N_\text{MPI}\times T_\text{auto} / N_\text{OMP}$, where $N_{\text{MPI}}$ is the number of cores and $N_{\text{OMP}}$ the number of OpenMP threads.
For a given number of independent measurements $N_\text{meas}$, we therefore need a wall-clock time given by
\begin{equation}\label{eqn:scaling}
T = \frac{T_\text{auto}}{N_\text{OMP}} \left( 1 + \frac{N_\text{meas}}{N_\text{MPI}} \right) \,.
\end{equation}
As we typically have $ N_\text{meas}/N_\text{MPI} \gg 1 $,
the speedup is expected to be almost perfect, in accordance with
the performance test results for the auxiliary field
QMC code on SuperMUC (see Fig.~\ref{fig_scaling} (left)).
For many problem sizes, 2~GB memory per MPI task (random walker) suffices such that we typically start as many MPI tasks as there are physical cores per node. Due to the large amount of CPU time spent in MKL routines, we do not profit from the hyper-threading option. For large systems, the memory requirement increases and this is tackled by increasing the amount of OpenMP threads to decrease the stress on the memory system and to simultaneously reduce the equilibration overhead (see Fig.~\ref{fig_scaling} (right)). For the displayed speedup, it was crucial to pin the MPI tasks as well as the OpenMP threads in a pattern which keeps the threads as compact as possible to profit from a shared cache. This also explains the drop in efficiency from 14 to 28 threads where the OpenMP threads are spread over both sockets.
We store the field configurations of the random walker as checkpoints, such that a long simulation can be easily split into several short simulations. This procedure allows us to take advantage of chained jobs using the dependency chains provided by the batch system.
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.6]{Figures/MPI_scaling_ALF_2.pdf}
\includegraphics[scale=0.6]{Figures/OMP_scaling_ALF_2.pdf}
\end{center}
\caption{\label{fig_scaling} MPI (left) and OpenMP (right) scaling behavior of the auxiliary field QMC code of the ALF project on SuperMUC (phase 2/Haswell nodes) at the LRZ in Munich.
The MPI performance data was normalized to 28 cores and was obtained using a problem size of $N_{\text{dim}}=400$. This is a medium to small system size that is the least favorable in terms of MPI synchronization effects.
The OpenMP performance data was obtained using a problem size of $N_{\text{dim}}=1296$. Employing 2 and 4 OpenMP threads introduces some synchronization/management overhead such that the per-core performance is slightly reduced, compared to the single thread efficiency. Further increasing the amount of threads to 7 and 14 keeps the efficiency constant. The drop in performance of the 28 thread configuration is due to the architecture as the threads are now spread over both sockets of the node. To obtain the above results, it was crucial to pin the processes in a fashion that keeps the OpenMP threads as compact as possible.}
\end{figure}
%Next to the entire computational time is spent in BLAS routines such that the performance of the code will depend on the particular implementation of this library.
%We have found that the code performs well, and that an efficient OpenMP version of the library can be obtained merely by loading the corresponding BLAS and LAPACK routines.
%\mycomment{MB: Do we want to say more about OpenMP here, i.e. that it can be useful when warm-up time is a problem (and getting many CPUs is not).
%In all other cases, the MPI parallelization is always better than the trivial OpenMP parallelization of library algos.}
......@@ -6,17 +6,17 @@
% !TEX root = Doc.tex
%-------------------------------------------------------------------------------------
\section{Running the code}\label{sec:running}
\subsection{Running the code}\label{sec:running}
%-------------------------------------------------------------------------------------
In this section we describe the steps to compile and run the code and to perform the error analysis of the data.
%-------------------------------------------------------------------------------------
\subsection{Compilation}
\subsubsection{Compilation}
%-------------------------------------------------------------------------------------
The environment variables are defined in the bash script \texttt{set\_env.sh} as follows:
\lstset{style=custombash}
\lstset{style=bash}
\begin{lstlisting}
# Description of PROGRAMMCONFIGURATION:
......@@ -24,8 +24,8 @@ The environment variables are defined in the bash script \texttt{set\_env.sh} as
# Setting nothing compiles without mpi.
# -DQRREF selects a reference implementation of the QR decomposition.
# Setting nothing selects system lapack for the QR decomposition.
# -DSTAB1 selects an alternative stabilization scheme.
# Setting nothing selects the default stabilization
# -DSTAB1, DSTAB2 selects an alternative stabilization scheme.
# Setting nothing selects the default stabilization
PROGRAMMCONFIGURATION=""
f90="gfortran"
export f90
......@@ -72,11 +72,11 @@ cd ..
\end{enumerate}
%-------------------------------------------------------------------------------------
\subsection{Starting a simulation}
\subsubsection{Starting a simulation}
%-------------------------------------------------------------------------------------
To start a simulation from scratch, the following files have to be present: \texttt{parameters} and \texttt{seeds}.
To run a single-thread simulation for one of the Hubbard models described in Secs.~\ref{sec:walk1}--\ref{sec:walk2}, issue the command
To run a single-thread simulation, for example by using the parameters of one of the Hubbard models described in Sec.~\ref{sec:ex}, issue the command
\begin{verbatim}
./Prog/Examples.out
\end{verbatim}
......@@ -84,10 +84,10 @@ To restart the code using an existing simulation as a starting point, first run
the input configuration files.
%-------------------------------------------------------------------------------------
\subsection{Error analysis}
\subsubsection{Error analysis}
%-------------------------------------------------------------------------------------
To perform an error analysis, based on the jackknife scheme, of the Monte Carlo bins for all observables run the script \texttt{analysis.sh}
To perform an error analysis (based on the jackknife scheme) of the Monte Carlo bins for all observables run the script \texttt{analysis.sh}
(see Sec.~\ref{sec:analysis}).
......
% !TEX root = doc.tex
% Copyright (c) 2016 The ALF project.
% This is a part of the ALF project documentation.
% The ALF project documentation by the ALF contributors is licensed
% under a Creative Commons Attribution-ShareAlike 4.0 International License.
% For the licensing details of the documentation see license.CCBYSA.
%
%------------------------------------------------------------
\subsection{Monte Carlo sampling}\label{sec:sampling}
%------------------------------------------------------------
%
The default updating scheme consists of local moves which change (upon acceptance) only one entry of $L_{\mathrm{Trotter}}(M_I+M_V)$ fields (see Sec. \ref{sec:updating}).
To generate an independent configuration $C$, one has to visit each field at least once. Our unit of \textit{sweeps} is defined such that each field is visited twice in a sequential propagation from $\tau = 0$ to $\tau = L_{\text{Trotter}}$ and back. A single sweep will generically not suffice to produce an independent configuration.
% This is however only the lower bound as there can be a region in the spin space where the fields are correlated and it requires a larger or even global move to significantly change the configuration to an independent one. One might imagine a ferromagnet due to spontaneous symmetry breaking. All spins are parallel aligned and, let' say, point upwards. The configuration of only down spins is equally justified, but rotating one to the other requires a global operation. Flipping the spins individually one after another generates intermediate states of relative high energy which corresponds to a low probability in the QMC algorithm.
In fact, the autocorrelation time $T_\mathrm{auto}$ characterizes the required time scale to generate an independent configuration or values $\langle\langle\hat{O}\rangle\rangle_C$ for the observable $O$.
This has several consequences for the Monte Carlo simulation: