#!/bin/bash
#PBS -l nodes=1:ppn=1
#PBS -l walltime=1:00:00
#PBS -N example_job
#
# PBS job script that runs ./my_job under DMTCP checkpointing.
#
# On the first submission the job is started through dmtcp_launch. If a
# previous run left a dmtcp_restart_script.sh in the submission directory,
# the job is resumed from that checkpoint instead.
#
# Required environment: PBS_O_WORKDIR (set by PBS to the directory from
# which the job was submitted).

# Run from the submission directory. Abort if it is unknown or unreachable:
# otherwise the checkpoint lookup below would silently happen elsewhere.
cd "${PBS_O_WORKDIR:?PBS_O_WORKDIR is not set; submit this script with qsub}" \
  || exit 1

# Add the module, just in case
module add DMTCP

# DMTCP communicates with our tasks through a socket. It uses port number
# 7779 by default, but if there are several DMTCP coordinators running on
# the same node we will have problems. The best solution is to assign the
# port number manually. Also, if PORT=0, a random unused port will be
# chosen, which is probably better.
PORT=7745

# Check whether a previous run left a restart script and, if so, run it.
# If not, just tell dmtcp_launch to run our program for us.
if [[ -e "dmtcp_restart_script.sh" ]]; then
  # We launch the restart script with the port number and also
  # with the name of the current node. If the first time it was run
  # on node n1 but upon restart we are assigned node n2, it will
  # fail unless we add that switch.
  ./dmtcp_restart_script.sh -p "${PORT}" -h "$(hostname)"
else
  # The -i switch tells dmtcp_launch the time in seconds between
  # checkpoints. Probably 60 is too small, so set it up accordingly.
  dmtcp_launch -i 60 -p "${PORT}" ./my_job
fi