kopecz@euler[~]>> ssh kopecz@its-cs1.its.uni-kassel.de
kopecz@its-cs1.its.uni-kassel.de's password:
Last login: Mon Feb 22 13:10:16 2016 from euler.mathematik.uni-kassel.de
#####################################################################################
          L I N U X   C L U S T E R   *   *   *   U N I   K A S S E L
*************************************************************************************
 module avail   -> zeigt alle verfuegbaren Module an
 module list    -> zeigt alle aktivierten Module an
 module load    -> laedt ein Modul
 module unload  -> entlaedt ein Modul
*************************************************************************************
 module avail   -> shows all available modules
 module list    -> lists loaded modules
 module load    -> loads a module
 module unload  -> unloads a module
#####################################################################################
Currently Loaded Modulefiles:
  1) pgi/14.3            7) jdk/1.8u65                  13) qepcad/B.1.69
  2) gcc/4.9.0           8) R/2.15.1                    14) saclib/2.2.5
  3) intel/14.0.3        9) x10/2.1.2                   15) simplify/1.18
  4) Maple/18           10) mpi/openmpi/1.8.1/gcc-4.9   16) sage/4.8
  5) Matlab/R2015b      11) polymake/2.9.9              17) CalculiX/2.5
  6) Mathematica/10.2   12) abaqus/6.11-3               18) nag/6.0
kopecz@its-cs1:/home/users/0026/kopecz>
kopecz@its-cs1:/home/users/0026/kopecz> module load mpi/openmpi/1.8.1/gcc-4.9
kopecz@its-cs1:/home/users/0026/kopecz>
kopecz@its-cs1:/home/fb17/kopecz> pwd
/home/fb17/kopecz
kopecz@its-cs1:/home/fb17/kopecz> ls
kopecz@its-cs1:/home/fb17/kopecz>
kopecz@its-cs1:/home/fb17/kopecz> echo $SHELL
/bin/bash
kopecz@its-cs1:/home/users/0026/kopecz> mkdir hello_serial
kopecz@its-cs1:/home/users/0026/kopecz> cd hello_serial
kopecz@its-cs1:/home/users/0026/kopecz/hello_serial> wget http://www.mathematik.uni-kassel.de/~kopecz/cluster/hello_serial.c
--2016-02-19 14:39:07--  http://www.mathematik.uni-kassel.de/~kopecz/cluster/hello_serial.c
Resolving www.mathematik.uni-kassel.de... 141.51.166.147
Connecting to www.mathematik.uni-kassel.de|141.51.166.147|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 184 [text/plain]
Saving to: “hello_serial.c”

100%[==============================================================>] 184         --.-K/s   in 0s

2016-02-19 14:39:07 (30.0 MB/s) - “hello_serial.c” saved [184/184]
kopecz@its-cs1:/home/users/0026/kopecz/hello_serial> ls
hello_serial.c
kopecz@its-cs1:/home/users/0026/kopecz/hello_serial> gcc hello_serial.c -o hello_serial
kopecz@its-cs1:/home/users/0026/kopecz/hello_serial> ls
hello_serial  hello_serial.c
kopecz@its-cs1:/home/users/0026/kopecz/hello_serial> ./hello_serial
Hello world from its-cs1.its.uni-kassel.de
kopecz@its-cs1:/home/users/0026/kopecz/hello_serial>
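The downloaded file is an ordinary serial C program. Its exact contents are not reprinted in this tutorial; as a rough sketch (an assumption, not necessarily the file served from the URL above), a program producing the greeting shown in the transcript could look like this:

/* Hypothetical sketch of hello_serial.c: print a greeting that
 * includes the name of the machine the program runs on. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char hostname[256];

    gethostname(hostname, sizeof(hostname));   /* name of the current host */
    printf("Hello world from %s\n", hostname);
    return 0;
}

Since the program was started on the login node, it reports the login node's name; only batch jobs run on the compute nodes.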
kopecz@its-cs1:/home/fb17/kopecz> mkdir hello_parallel
kopecz@its-cs1:/home/fb17/kopecz> cd hello_parallel
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel> wget www.mathematik.uni-kassel.de/~kopecz/cluster/hello_parallel.c
--2012-04-17 15:10:51--  http://www.mathematik.uni-kassel.de/~kopecz/hello_parallel.c
Resolving www.mathematik.uni-kassel.de... 141.51.166.147
Connecting to www.mathematik.uni-kassel.de|141.51.166.147|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 484 [text/plain]
Saving to: "hello_parallel.c"

100%[=============================================================================================================>] 484         --.-K/s   in 0s

2012-04-17 15:10:51 (107 MB/s) - "hello_parallel.c" saved [484/484]
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel> ls
hello_parallel.c
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel> mpicc hello_parallel.c -o hello_parallel
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel> ls
hello_parallel  hello_parallel.c
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel>
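Again, the source is not reprinted here. The following illustrative MPI sketch (an assumption based on the output shown further below, not necessarily the exact hello_parallel.c) prints the rank, the total number of processes, and the processor name of each MPI process:

/* Hypothetical sketch of hello_parallel.c: every rank reports itself. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank, size, namelen;
    char name[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);                 /* start the MPI runtime       */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);   /* rank of this process        */
    MPI_Comm_size(MPI_COMM_WORLD, &size);   /* total number of processes   */
    MPI_Get_processor_name(name, &namelen); /* node this process runs on   */

    printf("Hello from process %03d out of %03d, processor name %s\n",
           rank, size, name);

    MPI_Finalize();
    return 0;
}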
#!/bin/sh

# job name:
#SBATCH --job-name=Hello_pa

# write output and errors to files:
#SBATCH --output=slurm.out
#SBATCH --error=slurm.err

# requested wall clock time in HH:MM:SS:
#SBATCH --time=00:05:00

# requested memory per process in MB:
#SBATCH --mem-per-cpu=100

# requested partition:
#SBATCH --partition=minijobs

# requested number of nodes and processes per node:
#SBATCH --nodes=2
#SBATCH --tasks-per-node=2

# use exclusive option only for performance studies!
#SBATCH --exclusive

# start program:
mpirun ./hello_parallel
job-name | The job name will be displayed in the NAME column of squeue's output. See the discussion of squeue below. |
output | The program's output will be written to the file specified here. In this example it will be written to "slurm.out" in the current directory. |
error | Any error messages will be written to this file. In this example the file is called "slurm.err". |
time | Set a limit on the total run time of the job allocation. The time format used here is "hours:minutes:seconds", so in this tutorial the job is expected to finish within 5 minutes. For other possible time formats see the manpage of sbatch. |
mem-per-cpu | Minimum memory required per allocated CPU, in megabytes. Here we request 100 MB for each CPU. |
partition | Request a specific partition for the resource allocation. Valid partition names for this cluster are "minijobs" and "public". See also the discussion of sinfo and the constraint option below. |
nodes | Request the specific number of nodes to be allocated for the job. Using the above script, two nodes will be allocated. |
tasks-per-node | Specify the number of tasks to be launched per node. With this script two tasks will be launched per node. |
exclusive | This option guarantees exclusive access to the allocated nodes, which is indispensable for performance studies. It should be avoided for production runs unless the job requires a node's total memory. |
constraint | This option is important when computing on the public partition. This partition contains nodes of different architectures; more precisely, there are nodes with 4 cores and nodes with 12 cores. To ensure that you only compute on nodes with 12 cores, add the --constraint=12cores option to your script; otherwise use --constraint=4cores. See the example directives after this table. |
exclude | From time to time one of the cluster's nodes may be broken, causing jobs that include it to abort. In such a case (or for any other reason) you can define a set of nodes to be excluded from the job's allocation. For instance, --exclude=its-cs102,its-cs110 excludes the nodes its-cs102 and its-cs110. |
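As an illustration, the last two options could be added to a batch script for the public partition like this (the excluded node names are simply the example names from the table above):

# compute on the public partition, restrict the job to 12-core nodes,
# and exclude two nodes (example names only):
#SBATCH --partition=public
#SBATCH --constraint=12cores
#SBATCH --exclude=its-cs102,its-cs110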
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel> sbatch mpihello.slurm
Submitted batch job 424
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel> ls
hello_parallel  hello_parallel.c  mpihello.slurm  slurm.err  slurm.out
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel> cat slurm.err
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel> cat slurm.out
Hello from process 002 out of 004, processor name its-cs101.its.uni-kassel.de
Hello from process 000 out of 004, processor name its-cs100.its.uni-kassel.de
Hello from process 001 out of 004, processor name its-cs100.its.uni-kassel.de
Hello from process 003 out of 004, processor name its-cs101.its.uni-kassel.de
kopecz@its-cs1:/home/users/0026/kopecz> mkdir hello_send_recv
kopecz@its-cs1:/home/users/0026/kopecz> cd hello_send_recv/
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv> make
mpicc -g -O3 -c nodes_used.c -o nodes_used.o
mpicc -g -O3 -c -o hello_send_recv.o hello_send_recv.c
mpicc -g -O3 nodes_used.o hello_send_recv.o -o hello_send_recv -lm
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv> ls
hello_send_recv    hello_send_recv.o  nodes_used.c  nodes_used.o
hello_send_recv.c  Makefile           nodes_used.h
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv>
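The source files are not reproduced in this tutorial. As a rough, hypothetical sketch of the send/receive pattern behind hello_send_recv (the real hello_send_recv.c and nodes_used.c may differ considerably), each rank could send its processor name to rank 0, which then writes one line per process to nodes_used.log in the format seen below:

/* Hypothetical sketch: rank 0 collects processor names via MPI_Recv and
 * writes nodes_used.log; all other ranks send their name with MPI_Send. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank, size, namelen, src;
    char name[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Get_processor_name(name, &namelen);

    if (rank == 0) {
        FILE *log = fopen("nodes_used.log", "w");
        if (log == NULL) {
            MPI_Abort(MPI_COMM_WORLD, 1);   /* give up if the log cannot be opened */
        }
        fprintf(log, "Process %04d out of %04d running on processor %s\n",
                0, size, name);
        for (src = 1; src < size; src++) {
            char buf[MPI_MAX_PROCESSOR_NAME];
            MPI_Recv(buf, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, src, 0,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            fprintf(log, "Process %04d out of %04d running on processor %s\n",
                    src, size, buf);
        }
        fclose(log);
    } else {
        MPI_Send(name, namelen + 1, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
    }

    MPI_Finalize();
    return 0;
}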
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv> sbatch mpihello_send_recv.slurm
Submitted batch job 5459501
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv> ls
hello_send_recv    Makefile                   nodes_used.c    nodes_used.o
hello_send_recv.c  mpihello_send_recv2.slurm  nodes_used.h    slurm.err
hello_send_recv.o  mpihello_send_recv.slurm   nodes_used.log  slurm.out
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv> cat slurm.err
slurmd[its-cs199]: Unable to get current working directory: No such file or directory
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv> cat slurm.out
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv> cat nodes_used.log
Process 0000 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0001 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0002 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0003 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0004 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0005 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0006 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0007 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0008 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0009 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0010 out of 0012 running on processor its-cs199.its.uni-kassel.de
Process 0011 out of 0012 running on processor its-cs199.its.uni-kassel.de
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv>
All twelve processes ran on a single node, because mpihello_send_recv.slurm specifies nodes=1 and tasks-per-node=12. The second script, mpihello_send_recv2.slurm, requests nodes=4 and tasks-per-node=12. Thus, we expect that a job started with this script is run on 4 nodes with 12 processes each.
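A sketch of how mpihello_send_recv2.slurm might look, modeled on the script shown earlier (only the --nodes and --tasks-per-node values come from the description above; job name, partition and constraint are assumptions):

#!/bin/sh
#SBATCH --job-name=Hello_sr
#SBATCH --output=slurm.out
#SBATCH --error=slurm.err
#SBATCH --time=00:05:00
#SBATCH --mem-per-cpu=100
#SBATCH --partition=public
#SBATCH --constraint=12cores
# 4 nodes with 12 tasks each:
#SBATCH --nodes=4
#SBATCH --tasks-per-node=12
# start program:
mpirun ./hello_send_recv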
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv> sbatch mpihello_send_recv2.slurm
Submitted batch job 5459502
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv> cat nodes_used.log
Process 0000 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0001 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0002 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0003 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0004 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0005 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0006 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0007 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0008 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0009 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0010 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0011 out of 0048 running on processor its-cs202.its.uni-kassel.de
Process 0012 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0013 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0014 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0015 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0016 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0017 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0018 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0019 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0020 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0021 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0022 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0023 out of 0048 running on processor its-cs203.its.uni-kassel.de
Process 0024 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0025 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0026 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0027 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0028 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0029 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0030 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0031 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0032 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0033 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0034 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0035 out of 0048 running on processor its-cs204.its.uni-kassel.de
Process 0036 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0037 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0038 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0039 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0040 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0041 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0042 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0043 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0044 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0045 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0046 out of 0048 running on processor its-cs205.its.uni-kassel.de
Process 0047 out of 0048 running on processor its-cs205.its.uni-kassel.de
kopecz@its-cs1:/home/users/0026/kopecz/hello_send_recv>
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel> sbatch mpihello.slurm; squeue
  JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
    425  minijobs Hello_pa   kopecz  R       0:00      2 its-cs[100-101]
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel>
kopecz@its-cs1:/home/fb17/kopecz/poisson/study4> squeue -o "%.7i %.9P %.8j %.8u %.2t %.9M %.6D %.8c %.4C %.7h %R"
  JOBID PARTITION     NAME     USER ST      TIME  NODES MIN_CPUS CPUS  SHARED NODELIST(REASON)
   8509    public  poisson   kopecz PD      0:00      8        8   64      no (Resources)
   8587    public  poisson   kopecz PD      0:00      8        8   64      no (Priority)
   8570    public  poisson   kopecz PD      0:00      8        4   32      no (Priority)
   8507    public  poisson   kopecz PD      0:00      8        2   16      no (Priority)
   8506    public  poisson   kopecz PD      0:00      8        1    8      no (Priority)
   8434    public      mpi     zier  R  15:26:47      1        1   12      no its-cs224
   8435    public     mpi2     zier  R  15:26:47      1        1   12      no its-cs225
   8433    public   static     zier  R  18:50:17      1        1   12      no its-cs229
kopecz@its-cs1:/home/fb17/kopecz/hello_parallel> sinfo
PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
thphysik     up   infinite      3  down* its-cs[116,182,184]
thphysik     up   infinite      8   idle its-cs[117-120,158-159,179-180]
public*      up 10-00:00:0      3  down* its-cs[10,102,208]
public*      up 10-00:00:0     10   idle its-cs[103-105,145-146,148,219-222]
minijobs     up       5:00      2   idle its-cs[100-101]