|
|
(5 intermediate revisions by one user not shown) |
Line 20: |
Line 20: |
| <SAFplus_installation_dir>/src/examples/eval/src/app/csa104Comp | | <SAFplus_installation_dir>/src/examples/eval/src/app/csa104Comp |
| | | |
− | All messaging code has been isolated into a single module that consists of 2 files: msgFns.c and msgFns.h to increase readability. These files provide the following APIs.
| + | To increase readability, all messaging code has been isolated into a single module that consists of 2 files: msgFns.c and msgFns.h. These files provide the following APIs. |
| | | |
| | | |
Line 34: |
Line 34: |
| |} | | |} |
| | | |
− | These APIs constitute the basic operations of any messaging library; initialize, open, send and receive. | + | These APIs constitute the basic operations required by any application that uses messaging: initialize, open, send and receive. |
| | | |
| The following constants are also defined: | | The following constants are also defined: |
Line 218: |
Line 218: |
| ====Putting it all Together==== | | ====Putting it all Together==== |
| | | |
− | | + | These functions are called from the application's <code>clCompAppMain.c</code> file to implement an application that periodically sends a message from the standby to the active application. First we define a helper function that will repeatedly send messages. This function will loop so long as the application is standby, as indicated by a global variable "standby" that is set in the work assignment callback. |
− | | + | |
− | | + | |
− | The above software in <code>clCompAppMain.c</code> is new to csa103. The call to <code>checkpoint_initialize</code> is pretty straightforward. This function discussed in detail later within this section. The call to <code>checkpoint_read_seq</code> is also pretty straightforward. It reads the current sequence value from the checkpoint (at this point it should be zero) and loads it into the variable: <code>seq</code>. It is important to note the call to <code>checkpont_finalize</code> if the call to <code>checkpoint_read_seq</code> does not return <code>CL_OK</code>. This call closes the checkpoint and cleanly finalizes the checkpoint library. We'll look more at <code>checkpoint_read_seq</code> and <code>checkpoint_finalize</code> later.
| + | |
| | | |
| {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" | | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" |
Line 227: |
Line 224: |
| |- | | |- |
| |<code><pre> | | |<code><pre> |
− | /* Checkpoint new sequence number */
| + | void* senderLoop(void* p) |
− | rc = checkpoint_write_seq(seq);
| + | { |
− | if (rc != SA_AIS_OK) | + | int count =0; |
− | { | + | char msg[100]; |
− | clprintf(CL_LOG_SEV_ERROR,"%s: ERROR: Checkpoint write failed. Exiting.", appname);
| + | while (standby) |
− | break;
| + | { |
− | } | + | count++; |
| + | snprintf(msg,99,"Msg %4d from %.*s",count,appName.length,appName.value); |
| + | |
| + | clprintf(CL_LOG_SEV_INFO,"csa104: Sending Message: %s",msg); |
| + | msgSend(ACTIVE_COMP_QUEUE,msg,strlen(msg)+1); |
| + | sleep(2); |
| + | } |
| + | return NULL; |
| + | } |
| </pre></code> | | </pre></code> |
| |} | | |} |
| | | |
− | The rest of the csa103's main loop is the same as the main loop in csa102, but the seven lines above are new. Here we write the new value of the checkpoint variable to the checkpoint section and print and error if this fails. We'll look closer at <code>checkpoint_write_seq</code> later.
| + | Next, we initialize the messaging library from main(): |
| | | |
| {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" | | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" |
Line 243: |
Line 248: |
| |- | | |- |
| |<code><pre> | | |<code><pre> |
− | ClRcT
| + | ... |
− | clCompAppAMFCSISet( | + | /* |
− | ClInvocationT invocation,
| + | * Now register the component with AMF. At this point it is |
− | const ClNameT *compName,
| + | * ready to provide service, i.e. take work assignments. |
− | ClAmsHAStateT haState,
| + | */ |
− | ClAmsCSIDescriptorT csiDescriptor)
| + | |
− | {
| + | |
− | /*
| + | |
− | * ---BEGIN_APPLICATION_CODE---
| + | |
− | */
| + | |
− | ClCharT compname[100]={0};
| + | |
| | | |
− | /* | + | if ( (rc = saAmfComponentNameGet(amfHandle, &appName)) != SA_AIS_OK) |
− | * ---END_APPLICATION_CODE---
| + | goto errorexit; |
− | */
| + | if ( (rc = saAmfComponentRegister(amfHandle, &appName, NULL)) != SA_AIS_OK) |
| + | goto errorexit; |
| | | |
− | /*
| + | /* |
− | * Print information about the CSI Set
| + | * Initialize the log stream |
− | */
| + | */ |
− | strncpy(compname, compName->value, compName->length);
| + | clEvalAppLogStreamOpen((ClCharT*)appName.value, &gEvalLogStream); |
| | | |
− | clprintf (CL_LOG_SEV_INFO, "Component [%s] : PID [%d]. CSI Set Received",
| + | msgInitialize(); |
− | compname, (int)mypid);
| + | |
− | | + | |
− | clCompAppAMFPrintCSI(csiDescriptor, haState);
| + | |
| | | |
− | /*
| + | /* |
− | Take appropriate action based on state
| + | * Print out standard information for this component. |
− | */
| + | */ |
− | | + | |
− | switch ( haState )
| + | |
− | {
| + | |
− | case CL_AMS_HA_STATE_ACTIVE:
| + | |
− | {
| + | |
− | /*
| + | |
− | AMF has requested application to take the active HA state
| + | |
− | for the CSI.
| + | |
− | */
| + | |
− |
| + | |
− | /*
| + | |
− | ---BEGIN_APPLICATION_CODE---
| + | |
− | */
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
− | | + | |
− | | + | |
− | | + | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | |
− | ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c
| + | |
− | |-
| + | |
− | |<code><pre>
| + | |
− | clprintf(CL_LOG_SEV_INFO,"%s: Active state requested from state %d",
| + | |
− | appname, ha_state);
| + | |
− | | + | |
− | if (ha_state == SA_AMF_HA_STANDBY)
| + | |
− | {
| + | |
− | /* Read checkpoint, make our replica the active replica */
| + | |
− | clprintf(CL_LOG_SEV_INFO,"%s reading checkpoint", appname);
| + | |
− | rc = checkpoint_read_seq(&seq);
| + | |
− | clprintf(CL_LOG_SEV_INFO,"%s read checkpoint: seq = %u", appname, seq);
| + | |
− | }
| + | |
− | checkpoint_replica_activate();
| + | |
− | ha_state = SA_AMF_HA_ACTIVE;
| + | |
− | | + | |
− | | + | |
− | /*
| + | |
− | * ---END_APPLICATION_CODE---
| + | |
− | */
| + | |
− | | + | |
− | clCpmResponse(cpmHandle, invocation, CL_OK);
| + | |
− | break;
| + | |
− | }
| + | |
− | | + | |
− | case CL_AMS_HA_STATE_STANDBY:
| + | |
− | {
| + | |
− | /*
| + | |
− | * AMF has requested application to take the standby HA state
| + | |
− | * for this CSI.
| + | |
− | */
| + | |
− | | + | |
− | /*
| + | |
− | * ---BEGIN_APPLICATION_CODE---
| + | |
− | */
| + | |
− | | + | |
− | clprintf(CL_LOG_SEV_INFO," Standby state requested from state %d",ha_state);
| + | |
− |
| + | |
− | ha_state = SA_AMF_HA_STANDBY;
| + | |
− | | + | |
− | /*
| + | |
− | * ---END_APPLICATION_CODE---
| + | |
− | */
| + | |
− | | + | |
− | clCpmResponse(cpmHandle, invocation, CL_OK);
| + | |
− | break;
| + | |
− | }
| + | |
| | | |
| + | clEoMyEoIocPortGet(&iocPort); |
| + | ... |
| </pre></code> | | </pre></code> |
| |} | | |} |
| | | |
| | | |
| + | In the work assignment callback, we open the queue and spawn the message receiver thread when an active assignment is received, and spawn a message sender thread when a standby assignment is received. |
| | | |
| {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" | | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" |
| ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c | | ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c |
| |- | | |- |
− | |<code><pre> | + | |<code><pre> |
− | case CL_AMS_HA_STATE_QUIESCED:
| + | void clCompAppAMFCSISet(SaInvocationT invocation, |
− | {
| + | const SaNameT *compName, |
− | /*
| + | SaAmfHAStateT haState, |
− | * AMF has requested application to quiesce the CSI currently
| + | SaAmfCSIDescriptorT csiDescriptor) |
− | * assigned the active or quiescing HA state. The application
| + | { |
− | * must stop work associated with the CSI immediately.
| + | /* |
− | */
| + | * Print information about the CSI Set |
| + | */ |
| | | |
− | /*
| + | clprintf (CL_LOG_SEV_INFO, "Component [%.*s] : PID [%d]. CSI Set Received\n", |
− | * ---BEGIN_APPLICATION_CODE---
| + | compName->length, compName->value, mypid); |
− | */
| + | |
| | | |
− | clprintf(CL_LOG_SEV_INFO,"%s: QUIESCED", appname);
| + | clCompAppAMFPrintCSI(csiDescriptor, haState); |
− | ha_state = haState;
| + | |
| | | |
− | /*
| + | /* |
− | * ---END_APPLICATION_CODE---
| + | * Take appropriate action based on state |
− | */
| + | */ |
| | | |
− | clCpmResponse(cpmHandle, invocation, CL_OK);
| + | switch ( haState ) |
− | break;
| + | |
− | }
| + | |
− | | + | |
− | case CL_AMS_HA_STATE_QUIESCING:
| + | |
− | {
| + | |
− | /*
| + | |
− | * AMF has requested application to quiesce the CSI currently
| + | |
− | * assigned the active HA state. The application must stop work
| + | |
− | * associated with the CSI gracefully and not accept any new
| + | |
− | * workloads while the work is being terminated.
| + | |
− | */
| + | |
− | | + | |
− | /*
| + | |
− | * ---BEGIN_APPLICATION_CODE---
| + | |
− | */
| + | |
− | | + | |
− | clprintf(CL_LOG_SEV_INFO,"%s: QUIESCING", appname);
| + | |
− | ha_state = haState;
| + | |
− | | + | |
− | /*
| + | |
− | * ---END_APPLICATION_CODE---
| + | |
− | */
| + | |
− | | + | |
− | clCpmCSIQuiescingComplete(cpmHandle, invocation, CL_OK);
| + | |
− | break;
| + | |
− | }
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
− | | + | |
− | | + | |
− | | + | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | |
− | ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c
| + | |
− | |-
| + | |
− | |<code><pre>
| + | |
− | default:
| + | |
− | {
| + | |
− | break;
| + | |
− | }
| + | |
− | }
| + | |
− |
| + | |
− | return CL_OK;
| + | |
− | }
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
− | | + | |
− | | + | |
− | Now looking at csa103's implementation of <code>clCompAppAMFCSISet</code>, we see that rather than just a few cases, we handle several cases: <code>QUIESCING</code>, <code>QUIESCED</code>, <code>ACTIVE</code>, and <code>STANDBY</code>. In each state, the global variable <code>ha_state</code> to the new_state value that is passed in is set. This is similar to csa102, except that we add the feature that in the <code>CL_AMS_HA_STATE_ACTIVE</code> case if our previous state was <code>CL_AMS_HA_STATE_STANDBY</code> we read the sequence from the checkpoint so that the main loop will pick it up.
| + | |
− | | + | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | |
− | ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c
| + | |
− | |-
| + | |
− | |<code><pre>
| + | |
− | static ClRcT
| + | |
− | checkpoint_initialize()
| + | |
| { | | { |
− | SaAisErrorT rc = CL_OK; | + | case SA_AMF_HA_ACTIVE: |
− | SaVersionT ckpt_version = {'B', 1, 1};
| + | |
− | SaNameT ckpt_name = { strlen(CKPT_NAME), CKPT_NAME };
| + | |
− | ClUint32T seq_no;
| + | |
− | SaCkptCheckpointCreationAttributesT create_atts = {
| + | |
− | .creationFlags = SA_CKPT_WR_ACTIVE_REPLICA_WEAK |
| + | |
− | SA_CKPT_CHECKPOINT_COLLOCATED,
| + | |
− | .checkpointSize = sizeof(ClUint32T),
| + | |
− | .retentionDuration = (ClTimeT)10,
| + | |
− | .maxSections = 2, // two sections
| + | |
− | .maxSectionSize = sizeof(ClUint32T),
| + | |
− | .maxSectionIdSize = (ClSizeT)64
| + | |
− | | + | |
− |
| + | |
− | };
| + | |
− | SaCkptSectionCreationAttributesT section_atts = {
| + | |
− | .sectionId = &ckpt_sid,
| + | |
− | .expirationTime = SA_TIME_END
| + | |
− | | + | |
− | };
| + | |
− |
| + | |
− | clprintf(CL_LOG_SEV_INFO,"%s: checkpoint_initialize", appname);
| + | |
− | /* Initialize checkpointing service instance */
| + | |
− | rc = saCkptInitialize(&ckpt_svc_handle, /* Checkpoint service handle */
| + | |
− | NULL, /* Optional callbacks table */
| + | |
− | &ckpt_version); /* Required verison number */
| + | |
− | if (rc != SA_AIS_OK)
| + | |
| { | | { |
− | clprintf(CL_LOG_SEV_ERROR,"%s: ERROR: Failed to initialize checkpoint service", | + | /* |
− | appname);
| + | * AMF has requested application to take the active HA state |
− | return rc; | + | * for the CSI. |
| + | */ |
| + | pthread_t thr; |
| + | |
| + | clprintf(CL_LOG_SEV_INFO,"csa104: ACTIVE state requested; activating message queue receiver service"); |
| + | running = 1; |
| + | msgOpen(ACTIVE_COMP_QUEUE,QUEUE_LENGTH); |
| + | pthread_create(&thr,NULL,msgReceiverLoop,NULL); |
| + | |
| + | saAmfResponse(amfHandle, invocation, SA_AIS_OK); |
| + | break; |
| } | | } |
− | clprintf(CL_LOG_SEV_INFO,"%s: Checkpoint service initialized (handle=0x%llx)",
| |
− | appname, ckpt_svc_handle);
| |
− | </pre></code>
| |
− | |}
| |
− |
| |
− | Here is <code>checkpoint_initialize</code>. Here we initialize the <code>ClNameT</code> structure that holds the name of the checkpoint to be opened/created with the line: <code>ClNameT ckpt_name = { strlen(CKPT_NAME), CKPT_NAME };</code>. We then define a set of attributes to associate with the checkpoint when creating that checkpoint. The <code>creationFlags</code> includes <code>CL_CKPT_WR_ACTIVE_REPLICA</code>. This means that the checkpoint to be created will be asynchronous, or once active server updation is completed, the call returns to the application, all other replica updates happens parallel. <code>CheckpointSize</code> is set to the sizeof a 32 bit unsigned integer, which is the sequence number we print in the main loop. The <code>retentionDuration</code> is the time that the checkpoint service will keep the checkpoint in store after the last client has closed the checkpoint. <code>MaxSections</code> is 2 as this checkpoint will be used to store two sections. <code>MaxSectionSize</code> is the set to the sizeof a 32 bit unsigned integer, as application stores only unsigned integer. And <code>maxSectionIdSize</code> is 64 bytes.
| |
− |
| |
− | We next use <code>ClCkptSectionCreationAttributesT section_atts</code> to define the creation attributes for the sole section of the checkpoint. The <code>sectionId</code> is just the name of the section along with the number of bytes in the name. The <code>expirationTime</code> is set to <code>CL_TIME_END</code> to imply that the section is never deleted automatically if the checkpoint itself still remains.
| |
− |
| |
− | The call to <code>clCkptInitialize</code> has to be called before any other checkpoint functions. The <code>ckpt_svc_handle</code> is where the handle is returned. The handle must be passed to some future checkpoint api calls, namely the <code>clCkptCheckpointOpen</code> and <code>clCkptFinalize</code> calls. The callbacks table is passed as <code>NULL</code> which implies that our application doesn't provide any callbacks. Finally the <code>ckpt_version</code> identifies what version of the api this application is written to.
| |
− |
| |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| |
− | ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c
| |
− | |-
| |
− | |<code><pre>
| |
− | //
| |
− | // Create the checkpoint for read and write.
| |
− |
| |
− | rc = saCkptCheckpointOpen(ckpt_svc_handle, // Service handle
| |
− | &ckpt_name, // Checkpoint name
| |
− | &create_atts, // Optional creation attr.
| |
− | (SA_CKPT_CHECKPOINT_READ |
| |
− | SA_CKPT_CHECKPOINT_WRITE |
| |
− | SA_CKPT_CHECKPOINT_CREATE),
| |
− | (SaTimeT)-1, // No timeout
| |
− | &ckpt_handle); // Checkpoint handle
| |
− |
| |
− | if (rc != SA_AIS_OK)
| |
| | | |
| + | case SA_AMF_HA_STANDBY: |
| { | | { |
− | clprintf(CL_LOG_SEV_ERROR,"%s: ERROR: Failed [0x%x] to open checkpoint", | + | /* |
− | appname, rc);
| + | * AMF has requested application to take the standby HA state |
− | (void)saCkptFinalize(ckpt_svc_handle); | + | * for this CSI. |
− | return rc; | + | */ |
− | }
| + | pthread_t thr; |
− | clprintf(CL_LOG_SEV_INFO,"%s: Checkpoint opened (handle=0x%llx)", appname, ckpt_handle);
| + | clprintf(CL_LOG_SEV_INFO,"csa104: Standby state requested"); |
− | </pre></code>
| + | running = 0; |
− | |}
| + | standby = 1; |
− |
| + | pthread_create(&thr,NULL,senderLoop,NULL); |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | |
− | ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c
| + | |
− | |-
| + | |
− | |<code><pre>
| + | |
− | /*
| + | |
− | * Try to create a section so that updates can operate by overwriting
| + | |
− | * the section over and over again.
| + | |
− | * If subsequent processes come through here, they will fail to create
| + | |
− | * the section. That is OK, even though it will cause an error message
| + | |
− | * If the section create fails because the section is already there, then
| + | |
− | * read the sequence number
| + | |
− | */
| + | |
− | // Put data in network byte order
| + | |
− | seq_no = htonl(seq);
| + | |
| | | |
− | // Creating the section
| + | saAmfResponse(amfHandle, invocation, SA_AIS_OK); |
− | checkpoint_replica_activate();
| + | break; |
− | rc = saCkptSectionCreate(ckpt_handle, // Checkpoint handle
| + | |
− | §ion_atts, // Section attributes
| + | |
− | (SaUint8T*)&seq_no, // Initial data
| + | |
− | (SaSizeT)sizeof(seq_no)); // Size of data
| + | |
− | if (rc != SA_AIS_OK && (CL_GET_ERROR_CODE(rc) != SA_AIS_ERR_EXIST))
| + | |
− | {
| + | |
− | clprintf(CL_LOG_SEV_ERROR,"%s: ERROR: Failed to create checkpoint section", appname);
| + | |
− | (void)saCkptCheckpointClose(ckpt_handle);
| + | |
− | (void)saCkptFinalize(ckpt_svc_handle);
| + | |
− | return rc; | + | |
| } | | } |
− | else if (rc != SA_AIS_OK && (CL_GET_ERROR_CODE(rc) == SA_AIS_ERR_EXIST))
| + | ... |
− | {
| + | |
− | rc = checkpoint_read_seq(&seq);
| + | |
− | if (rc != CL_OK)
| + | |
− | {
| + | |
− | clprintf(CL_LOG_SEV_ERROR,"%s: ERROR: Failed [0x%x] to read checkpoint section",
| + | |
− | appname, rc);
| + | |
− | (void)saCkptCheckpointClose(ckpt_handle);
| + | |
− | (void)saCkptFinalize(ckpt_svc_handle);
| + | |
− | return rc;
| + | |
− | }
| + | |
− | }
| + | |
− | else
| + | |
− | {
| + | |
− | clprintf(CL_LOG_SEV_INFO,"%s: Section created", appname);
| + | |
− | }
| + | |
− | | + | |
− | return CL_OK;
| + | |
− | }
| + | |
| </pre></code> | | </pre></code> |
| |} | | |} |
| | | |
− | With <code>rc = clCkptCheckpointOpen</code> we attempt to open the specified checkpoint. When <code>checkpoint_initialize</code> is called the checkpoint may or may not exist. We attempt to open the checkpoint for READ/WRITE access.
| + | ===How to Run csa104 and What to Observe=== |
| | | |
− | Next we check if we created the checkpoint and then we need to create the section with a call to <code>clCkptSectionCreate</code>. We pass the checkpoint handle obtained from <code>clCkptCheckpointOpen</code>, the section attributes declared earlier, and the address of the initial value along with the size in bytes of the initial value.
| + | This sample application runs 2 processes on SCNodeI0 (first system controller) in all the hardware setups described at the beginning of this eval guide. While it is certainly possible to run messaging across multiple nodes, this single node configuration makes evaluation simpler. |
| | | |
− | We convert the initial value to network byte order by the code <code>seq_no = htonl(seq)</code>, before passing it to <code>clCkptSectionCreate</code>.
| + | csa104 is "enabled" by default when SAFplus is started so there is no need to enter the SAFplus Debug Console and change its program state. |
| | | |
− | If we do not create the section, then we read the current value in the checkpoint into our global seq variable.
| |
| | | |
− | Note that whenever we return an error return code from the function that we have either not opened the checkpoint/initialized the checkpoint service, or we have closed the checkpoint and finalized the checkpoint service with calls to <code>clCkptCheckpointClose</code> and <code>clCkptFinalize</code>.
| + | The following output is given when you run <code>tail -f</code> on the csa104 log files. For example: |
| + | <code><pre> |
| + | # ./eval start |
| + | # tail -f var/log/csa104CompI?Log.latest |
| + | </pre></code> |
| | | |
| {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" | | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" |
− | ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c | + | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa104CompI?Log.latest |
| |- | | |- |
| |<code><pre> | | |<code><pre> |
− | static ClRcT
| + | ==> var/log/csa104CompI0Log.latest <== |
− | checkpoint_finalize(void)
| + | Fri Jan 11 14:38:45.215 2013 (SCNodeI0.3301 : csa104Comp.---.---.00009 : INFO) Name value pairs : |
− | {
| + | Fri Jan 11 14:38:45.215 2013 (SCNodeI0.3301 : csa104Comp.---.---.00010 : INFO) HA state : [Active] |
− | SaAisErrorT rc;
| + | Fri Jan 11 14:38:45.215 2013 (SCNodeI0.3301 : csa104Comp.---.---.00011 : INFO) Active Descriptor : |
| + | Fri Jan 11 14:38:45.215 2013 (SCNodeI0.3301 : csa104Comp.---.---.00012 : INFO) Transition Descriptor : [1] |
| + | Fri Jan 11 14:38:45.215 2013 (SCNodeI0.3301 : csa104Comp.---.---.00013 : INFO) Active Component : [csa104CompI0] |
| + | Fri Jan 11 14:38:45.215 2013 (SCNodeI0.3301 : csa104Comp.---.---.00014 : INFO) csa104: ACTIVE state requested; activating message queue receiver service |
| + | Fri Jan 11 14:38:45.250 2013 (SCNodeI0.3301 : csa104Comp.---.---.00015 : INFO) Received Message : Msg 1 from csa104CompI1 |
| | | |
− | rc = saCkptCheckpointClose(ckpt_handle);
| + | Fri Jan 11 14:38:47.251 2013 (SCNodeI0.3301 : csa104Comp.---.---.00016 : INFO) Received Message : Msg 2 from csa104CompI1 |
− | if (rc != SA_AIS_OK)
| + | |
− | {
| + | |
− | clprintf(CL_LOG_SEV_ERROR,"%s: failed: [0x%x] to close checkpoint handle 0x%llx",
| + | |
− | appname, rc, ckpt_handle);
| + | |
− | }
| + | |
− | rc = saCkptFinalize(ckpt_svc_handle);
| + | |
− | if (rc != SA_AIS_OK)
| + | |
− | {
| + | |
− | clprintf(CL_LOG_SEV_ERROR,"%s: failed: [0x%x] to finalize checkpoint",
| + | |
− | appname, rc);
| + | |
− | }
| + | |
− | return CL_OK;
| + | |
| | | |
− | }
| |
− | </pre></code>
| |
− | |}
| |
| | | |
− | The above code snippet presents <code>checkpoint_finalize</code>. It's quite simple. First it closes the checkpoint handle with a call to <code>clCkptCheckpointClose</code>. Next it finalizes the checkpoint library with a call to <code>clCkptFinalize</code>.
| + | ==> var/log/csa104CompI1Log.latest <== |
| + | Fri Jan 11 14:38:45.250 2013 (SCNodeI0.3302 : csa104Comp.---.---.00007 : INFO) CSI Flags : [Add One] |
| + | Fri Jan 11 14:38:45.250 2013 (SCNodeI0.3302 : csa104Comp.---.---.00008 : INFO) CSI Name : [csa104CSII] |
| + | Fri Jan 11 14:38:45.250 2013 (SCNodeI0.3302 : csa104Comp.---.---.00009 : INFO) Name value pairs : |
| + | Fri Jan 11 14:38:45.250 2013 (SCNodeI0.3302 : csa104Comp.---.---.00010 : INFO) HA state : [Standby] |
| + | Fri Jan 11 14:38:45.250 2013 (SCNodeI0.3302 : csa104Comp.---.---.00011 : INFO) Standby Descriptor : |
| + | Fri Jan 11 14:38:45.250 2013 (SCNodeI0.3302 : csa104Comp.---.---.00012 : INFO) Standby Rank : [1] |
| + | Fri Jan 11 14:38:45.250 2013 (SCNodeI0.3302 : csa104Comp.---.---.00013 : INFO) Active Component : [csa104CompI0] |
| + | Fri Jan 11 14:38:45.250 2013 (SCNodeI0.3302 : csa104Comp.---.---.00014 : INFO) csa104: Standby state requested |
| + | Fri Jan 11 14:38:45.250 2013 (SCNodeI0.3302 : csa104Comp.---.---.00015 : INFO) csa104: Sending Message: Msg 1 from csa104CompI1 |
| + | Fri Jan 11 14:38:47.251 2013 (SCNodeI0.3302 : csa104Comp.---.---.00016 : INFO) csa104: Sending Message: Msg 2 from csa104CompI1 |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | ==> var/log/csa104CompI0Log.latest <== |
− | ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c
| + | Fri Jan 11 14:38:49.252 2013 (SCNodeI0.3301 : csa104Comp.---.---.00017 : INFO) Received Message : Msg 3 from csa104CompI1 |
− | |-
| + | |
− | |<code><pre> | + | |
− | static ClRcT
| + | |
− | checkpoint_write_seq(ClUint32T seq)
| + | |
− | {
| + | |
− | SaAisErrorT rc = SA_AIS_OK;
| + | |
− | ClUint32T seq_no;
| + | |
| | | |
− | /* Putting data in network byte order */
| |
− | seq_no = htonl(seq);
| |
| | | |
− | /* Write checkpoint */
| + | ==> var/log/csa104CompI1Log.latest <== |
− | retry:
| + | Fri Jan 11 14:38:49.252 2013 (SCNodeI0.3302 : csa104Comp.---.---.00017 : INFO) csa104: Sending Message: Msg 3 from csa104CompI1 |
− | rc = saCkptSectionOverwrite(ckpt_handle,
| + | Fri Jan 11 14:38:51.253 2013 (SCNodeI0.3302 : csa104Comp.---.---.00018 : INFO) csa104: Sending Message: Msg 4 from csa104CompI1 |
− | &ckpt_sid,
| + | |
− | &seq_no,
| + | |
− | sizeof(ClUint32T));
| + | |
− | if (rc != SA_AIS_OK)
| + | |
− | {
| + | |
− | clprintf(CL_LOG_SEV_ERROR,"Failed [0x%x] to write to section", rc);
| + | |
− | if(rc == SA_AIS_ERR_NOT_EXIST)
| + | |
− | rc = checkpoint_replica_activate();
| + | |
− | if(rc == CL_OK) goto retry;
| + | |
− | }
| + | |
− | else
| + | |
− | {
| + | |
− | /*
| + | |
− | * Synchronize the checkpoint to all the replicas.
| + | |
− | */
| + | |
− | rc = saCkptCheckpointSynchronize(ckpt_handle, SA_TIME_END );
| + | |
− | if (rc != SA_AIS_OK)
| + | |
− | {
| + | |
− | clprintf(CL_LOG_SEV_ERROR,"Failed [0x%x] to synchronize the checkpoint", rc);
| + | |
− | }
| + | |
− | }
| + | |
| | | |
− | return CL_OK;
| + | ==> var/log/csa104CompI0Log.latest <== |
− | }
| + | Fri Jan 11 14:38:51.253 2013 (SCNodeI0.3301 : csa104Comp.---.---.00018 : INFO) Received Message : Msg 4 from csa104CompI1 |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | With <code>checkpoint_write_seq</code> we first convert the sequence number passed into network byte order. Then we pass it to <code>clCKptSectionOverwrite</code> which writes it to the checkpoint section created in <code>checkpoint_initialize</code>.
| |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | ==> var/log/csa104CompI1Log.latest <== |
− | ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c
| + | Fri Jan 11 14:38:53.253 2013 (SCNodeI0.3302 : csa104Comp.---.---.00019 : INFO) csa104: Sending Message: Msg 5 from csa104CompI1 |
− | |-
| + | |
− | |<code><pre>
| + | |
− | static ClRcT
| + | |
− | checkpoint_read_seq(ClUint32T *seq)
| + | |
− | {
| + | |
− | ClRcT rc = CL_OK;
| + | |
− | ClUint32T err_idx; /* Error index in ioVector */
| + | |
− | ClUint32T seq_no = 0xffffffff;
| + | |
− | SaCkptIOVectorElementT iov = {
| + | |
− | .sectionId = ckpt_sid,
| + | |
− | .dataBuffer = (ClPtrT)&seq_no,
| + | |
− | .dataSize = sizeof(ClUint32T),
| + | |
− | .dataOffset = (ClOffsetT)0,
| + | |
− | .readSize = sizeof(ClUint32T)
| + | |
− | };
| + | |
| | | |
− | rc = saCkptCheckpointRead(ckpt_handle, &iov, 1, &err_idx);
| + | ==> var/log/csa104CompI0Log.latest <== |
− | if (rc != SA_AIS_OK)
| + | Fri Jan 11 14:38:53.254 2013 (SCNodeI0.3301 : csa104Comp.---.---.00019 : INFO) Received Message : Msg 5 from csa104CompI1 |
− | {
| + | |
− | clprintf(CL_LOG_SEV_ERROR,"Error: [0x%x] from checkpoint read, err_idx = %u",
| + | |
− | rc, err_idx);
| + | |
− | }
| + | |
| | | |
− | /* FIXME: How to process this err_idx? */
| |
− | *seq = ntohl(seq_no);
| |
| | | |
− | return CL_OK;
| + | ==> var/log/csa104CompI1Log.latest <== |
− | | + | Fri Jan 11 14:38:55.254 2013 (SCNodeI0.3302 : csa104Comp.---.---.00020 : INFO) csa104: Sending Message: Msg 6 from csa104CompI1 |
− | }
| + | |
| | | |
| + | ==> var/log/csa104CompI0Log.latest <== |
| + | Fri Jan 11 14:38:55.255 2013 (SCNodeI0.3301 : csa104Comp.---.---.00020 : INFO) Received Message : Msg 6 from csa104CompI1 |
| </pre></code> | | </pre></code> |
| |} | | |} |
| | | |
− | With <code>checkpoint_read_seq</code> we initialize <code>seq_no</code> to all ones just so it will be more obvious if the <code>clCkptCheckpointRead</code> call fails and the value of <code>seq_no</code> doesn't get overwritten. Then, we set up the <code>iov</code> variable. We set the <code>sectionID</code> field to <code>ckpt_sid</code> which is the id of the section created in <code>checkpoint_initialize</code>. The <code>dataBuffer</code> gets set to the address of the <code>seq_no</code> variable which is where we want the checkpoint data loaded. The <code>dataSize</code> field is set to the the size of the <code>seq_no</code> variable. <code>DataOfset</code> is set to zero since the sequence number is the only thing stored in the section, so it resides at the front of the section. <code>readSize</code> is set to the size of the sequence number variable.
| + | The logs show the work assignment occurring with csa104CompI0 as ACTIVE and csa104CompI1 as STANDBY. This causes the csa104CompI1 (standby) component to start sending messages and the csa104CompI0 (active) component to begin receiving them. |
| | | |
− | Within <code>checkpoint_read_seq</code> we pass the iovector to <code>clCkptCheckpointRead</code> along with the <code>ckpt_handle</code>, the constant 1, and the address of the <code>err_idx</code> variable. The constant 1 is the number of elements in the array of <code>ClCkptIOVectorElementT structs</code> that we're passing. That is, we're passing the address of a single <code>ClCkptIOVectorElementT struct</code>. The <code>err_idx</code> will hold the index in that array where <code>clCkptCheckpointRead</code> found an error. That is, if there is going to be an error, it will be at index 0 since we're only passing on entry in the <code>iovector</code>.
| + | [[File:OpenClovis_Note.png]]Sometimes the output shows a double send, double receive or even a receive before a send! This is an effect of the tail program polling the logs and not an actual issue. |
| | | |
| + | Next, find the active csa104 process and kill it. In this case, it's the one that's receiving the messages. The process ID is available in the log, which is formatted as follows: |
| + | date (Node.PID : compname.---.---.logCount : SEVERITY) Message |
| + | Fri Jan 11 14:38:55.255 2013 (SCNodeI0.3301 : csa104Comp.---.---.00020 : INFO) Received Message : Msg 6 from csa104CompI1 |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | So in the log above the active process is pid 3301. |
− | ! style="color:black;background-color:#ffccaa;" align="center"| clCompAppMain.c
| + | |
− | |-
| + | |
− | |<code><pre>
| + | |
| | | |
− | static ClRcT
| + | While keeping the tail running, open a new window and run the kill command: |
− | checkpoint_replica_activate(void)
| + | |
− | {
| + | |
− | SaAisErrorT rc = SA_AIS_OK;
| + | |
| | | |
− | if ((rc = saCkptActiveReplicaSet(ckpt_handle)) != SA_AIS_OK)
| |
− | {
| |
− | clprintf(CL_LOG_SEV_ERROR,
| |
− | "checkpoint_replica_activate failed [0x%x] in ActiveReplicaSet",
| |
− | rc);
| |
− | }
| |
− | else rc = CL_OK;
| |
− |
| |
− | return rc;
| |
− | }
| |
− | </pre></code>
| |
− | |}
| |
− |
| |
− | ===How to Run csa103 and What to Observe===
| |
− |
| |
− | This sample application can run on all Runtime Hardware Setups. The following, lists which node <code>csa103compI*</code> runs on.
| |
− | <ul>
| |
− | <li>'''Runtime Hardware Setup 1.1 and 2.1'''
| |
− | <br>csa103compI0 and csa103compI1 will run upon the single node.
| |
− | <li>'''Runtime Hardware Setup 1.2 and 2.2'''
| |
− | <br>csa103compI0 runs on PayloadNodeI0 and csa103compI1 runs on PayloadNodeI1
| |
− | <li>'''Runtime Hardware Setup 1.3 and 2.3'''
| |
− | <br>csa103compI2 and csa103compI3 run on SCNodeI0 and SCNodeI1 respectively. csa103compI0 and csa103compI1 run on PayloadNodeI0 and PayloadNodeI1 respectively.
| |
− | <br><br>In the four node set up SCNodeI0 and SCNodeI1 output data via <code>/root/asp/var/log/csa103CompI2</code> and <code>/root/asp/var/log/csa103CompI3</code> respectively. Therefore in this case follow the below instructions replacing csa103CompI0 and csa103compI1 with csa103compI2 and csa103compI3 respectively.
| |
− | </ul>
| |
− |
| |
− | We run csa103 very much the same way we run any of the rest of the sample applications: with the SAFplus Platform Console.
| |
− |
| |
− | <ol>
| |
− | <li>First, on the active System Controller move it to state LockAssignment with (Unlock csa203SGI0 instead of csa103SGI0 to run csa203)
| |
| <code><pre> | | <code><pre> |
− | cli[Test]-> setc 1
| + | # kill 3301 |
− | cli[Test:SCNodeI0]-> setc cpm
| + | |
− | cli[Test:SCNodeI0:CPM]-> amsLockAssignment sg csa103SGI0
| + | |
| </pre></code> | | </pre></code> |
| | | |
− | The following output is given when you run <code>tail -f</code> on the csa103 log files. For example:
| + | After killing the active component you should see lines in the log files like the following: |
− | <code><pre>
| + | |
− | # tail -f /root/asp/var/log/csa103CompI0.log
| + | |
− | </pre></code>
| + | |
− | | + | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI0Log.latest
| + | |
− | |-
| + | |
− | |<code><pre>
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00030 : INFO)
| + | |
− | Component [csa103CompI0] : PID [15238]. Initializing
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00031 : INFO)
| + | |
− | IOC Address : 0x1
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00032 : INFO)
| + | |
− | IOC Port : 0x81
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00033 : INFO)
| + | |
− | csa103: Instantiated as component instance csa103CompI0.
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00034 : INFO)
| + | |
− | csa103CompI0: Waiting for CSI assignment...
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00035 : INFO)
| + | |
− | csa103CompI0: Waiting for CSI assignment...
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00036 : INFO)
| + | |
− | csa103CompI0: checkpoint_initialize
| + | |
− | Mon Jul 14 00:01:10 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00043 : INFO)
| + | |
− | csa103CompI0: Checkpoint service initialized (handle=0x1)
| + | |
− | Mon Jul 14 00:01:10 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00045 : INFO)
| + | |
− | csa103CompI0: Checkpoint opened (handle=0x2)
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
− | | + | |
− | <code><pre>
| + | |
− | # tail -f /root/asp/var/log/csa103CompI1.log
| + | |
− | </pre></code>
| + | |
| | | |
| {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" | | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" |
Line 753: |
Line 428: |
| |- | | |- |
| |<code><pre> | | |<code><pre> |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00030 : INFO)
| + | root@gh-lubuntu1204:~/eval# tail -f var/log/csa104CompI?Log.latest |
− | Component [csa103CompI1] : PID [15234]. Initializing
| + | ==> var/log/csa104CompI1Log.latest <== |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00031 : INFO)
| + | Fri Jan 11 14:43:19.398 2013 (SCNodeI0.3302 : csa104Comp.---.---.00152 : INFO) csa104: Sending Message: Msg 138 from csa104CompI1 |
− | IOC Address : 0x1
| + | Fri Jan 11 14:43:21.399 2013 (SCNodeI0.3302 : csa104Comp.---.---.00153 : INFO) csa104: Sending Message: Msg 139 from csa104CompI1 |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00032 : INFO)
| + | |
− | IOC Port : 0x80
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00033 : INFO)
| + | |
− | csa103: Instantiated as component instance csa103CompI1.
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00034 : INFO)
| + | |
− | csa103CompI1: Waiting for CSI assignment...
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00035 : INFO)
| + | |
− | csa103CompI1: Waiting for CSI assignment... | + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00036 : INFO)
| + | |
− | csa103CompI1: checkpoint_initialize
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00043 : INFO)
| + | |
− | csa103CompI1: Checkpoint service initialized (handle=0x1)
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00045 : INFO)
| + | |
− | csa103CompI1: Checkpoint opened (handle=0x2) | + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00053 : INFO)
| + | |
− | csa103CompI1: Section created
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | <li>Then, unlock the application by running
| + | ==> var/log/csa104CompI0Log.latest <== |
− | <code><pre> | + | Fri Jan 11 14:43:21.400 2013 (SCNodeI0.3301 : csa104Comp.---.---.00153 : INFO) Received Message : Msg 139 from csa104CompI1 |
− | cli[Test:SCNodeI0:CPM]-> amsUnlock sg csa103SGI0
| + | |
− | </pre></code>
| + | |
| | | |
− | In the application log files, you should see
| |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | ==> var/log/csa104CompI1Log.latest <== |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI0Log.latest
| + | Fri Jan 11 14:43:23.400 2013 (SCNodeI0.3302 : csa104Comp.---.---.00154 : INFO) csa104: Sending Message: Msg 140 from csa104CompI1 |
− | |-
| + | |
− | |<code><pre>
| + | |
− | Mon Jul 14 00:09:08 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00058 : INFO)
| + | |
− | Standby state requested from state 0
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | ==> var/log/csa104CompI0Log.latest <== |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI1Log.latest
| + | Fri Jan 11 14:43:23.401 2013 (SCNodeI0.3301 : csa104Comp.---.---.00154 : INFO) Received Message : Msg 140 from csa104CompI1 |
− | |-
| + | |
− | |<code><pre>
| + | |
− | Mon Jul 14 00:09:08 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00064 : INFO)
| + | |
− | csa103CompI1: Active state requested from state 0
| + | |
− | Mon Jul 14 00:09:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00065 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=0)
| + | |
− | Mon Jul 14 00:09:10 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00066 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=1)
| + | |
− | Mon Jul 14 00:09:11 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00067 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=2)
| + | |
− | Mon Jul 14 00:09:12 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00068 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=3) | + | |
− | Mon Jul 14 00:09:13 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00069 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=4) | + | |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | <li>Next, find the active csa103 process, the one that's printing the Hello World lines and kill it. To find the process ID issue the following command from a bash shell.
| + | Fri Jan 11 14:43:25.158 2013 (SCNodeI0.4069 : csa104Comp.---.---.00001 : INFO) Component [csa104CompI0] : PID [4069]. Initializing |
− | <code><pre>
| + | |
− | # ps -eaf | grep csa103
| + | |
− | </pre></code>
| + | |
− | This should produce an output that looks similar to the following.
| + | |
− | root 17830 15663 0 14:21 ? 00:00:00 csa103Comp -p
| + | |
− | root 17839 15663 0 14:21 ? 00:00:00 csa103Comp -p
| + | |
− | root 18558 16145 0 14:32 pts/4 00:00:00 grep csa103
| + | |
− | Notice the two entries that end with csa103Comp -p. These are our two component processes. The first one is usually the active process. This is the one that we will kill. In this case the process ID is 17830. So to kill the active component you issue the command:
| + | |
| | | |
− | # kill -9 17830
| + | Fri Jan 11 14:43:25.158 2013 (SCNodeI0.4069 : csa104Comp.---.---.00002 : INFO) IOC Address : 0x1 |
| | | |
− | [[File:OpenClovis_Note.png]]If this step does not result in the active component being killed then it is likely that the standby component was killed. In this case simply try killing the other process.
| + | Fri Jan 11 14:43:25.158 2013 (SCNodeI0.4069 : csa104Comp.---.---.00003 : INFO) IOC Port : 0x89 |
| | | |
− | After killing the active component you should see lines in the log files like the following:
| + | Fri Jan 11 14:43:25.159 2013 (SCNodeI0.4069 : csa104Comp.---.---.00004 : INFO) csa102: Instantiated as component instance csa104CompI0. |
| + | Fri Jan 11 14:43:25.159 2013 (SCNodeI0.4069 : csa104Comp.---.---.00005 : INFO) csa104CompI0: Waiting for CSI assignment... |
| + | Fri Jan 11 14:43:25.159 2013 (SCNodeI0.4069 : csa104Comp.---.---.00006 : INFO) Component [csa104CompI0] : PID [4069]. CSI Set Received |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | Fri Jan 11 14:43:25.159 2013 (SCNodeI0.4069 : csa104Comp.---.---.00007 : INFO) CSI Flags : [Add One] |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI1Log.latest
| + | Fri Jan 11 14:43:25.159 2013 (SCNodeI0.4069 : csa104Comp.---.---.00008 : INFO) CSI Name : [csa104CSII] |
− | |-
| + | Fri Jan 11 14:43:25.159 2013 (SCNodeI0.4069 : csa104Comp.---.---.00009 : INFO) HA state : [Active] |
− | |<code><pre>
| + | Fri Jan 11 14:43:25.159 2013 (SCNodeI0.4069 : csa104Comp.---.---.00010 : INFO) Active Descriptor : |
− | Mon Jul 14 00:16:43 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00515 : INFO)
| + | Fri Jan 11 14:43:25.159 2013 (SCNodeI0.4069 : csa104Comp.---.---.00011 : INFO) Transition Descriptor : [1] |
− | csa103CompI1: Hello World! (seq=452)
| + | Fri Jan 11 14:43:25.159 2013 (SCNodeI0.4069 : csa104Comp.---.---.00012 : INFO) Active Component : [csa104CompI0] |
− | Mon Jul 14 00:16:44 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00516 : INFO)
| + | Fri Jan 11 14:43:25.159 2013 (SCNodeI0.4069 : csa104Comp.---.---.00013 : INFO) csa104: ACTIVE state requested; activating message queue receiver service |
− | csa103CompI1: Hello World! (seq=453)
| + | |
− | Mon Jul 14 00:16:45 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00517 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=454)
| + | |
− | Mon Jul 14 00:16:46 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00518 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=455)
| + | |
− | </pre></code> | + | |
− | |}
| + | |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | ==> var/log/csa104CompI1Log.latest <== |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /var/log/csa103CompI0Log.latest
| + | Fri Jan 11 14:43:25.401 2013 (SCNodeI0.3302 : csa104Comp.---.---.00155 : INFO) csa104: Sending Message: Msg 141 from csa104CompI1 |
− | |-
| + | |
− | |<code><pre>
| + | |
− | Mon Jul 14 00:16:48 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00065 : INFO)
| + | |
− | csa103CompI0: Active state requested from state 2
| + | |
− | Mon Jul 14 00:16:48 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00066 : INFO)
| + | |
− | csa103CompI0 reading checkpoint
| + | |
− | Mon Jul 14 00:16:48 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00067 : INFO)
| + | |
− | csa103CompI0 read checkpoint: seq = 456
| + | |
− | Mon Jul 14 00:16:49 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00068 : INFO)
| + | |
− | csa103CompI0: Hello World! (seq=456)
| + | |
− | Mon Jul 14 00:16:50 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00069 : INFO)
| + | |
− | csa103CompI0: Hello World! (seq=457)
| + | |
− | Mon Jul 14 00:16:51 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00070 : INFO)
| + | |
− | csa103CompI0: Hello World! (seq=458) | + | |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | Where we can see CompI1 printing 455 and then dieing, where upon CompI0 gets the notice to take over processing, reads the checkpoint and then takes over with seq=456 and so on.
| + | ==> var/log/csa104CompI0Log.latest <== |
| + | Fri Jan 11 14:43:25.401 2013 (SCNodeI0.4069 : csa104Comp.---.---.00014 : INFO) Received Message : Msg 141 from csa104CompI1 |
| | | |
− | Then, in the CompI1 log file we see:
| + | Fri Jan 11 14:43:27.403 2013 (SCNodeI0.4069 : csa104Comp.---.---.00015 : INFO) Received Message : Msg 142 from csa104CompI1 |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI1Log.latest
| |
− | |-
| |
− | |<code><pre>
| |
− | Mon Jul 14 00:16:49 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00044 : INFO)
| |
− | csa103: Instantiated as component instance csa103CompI1.
| |
− | Mon Jul 14 00:16:49 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00045 : INFO)
| |
− | csa103CompI1: Waiting for CSI assignment...
| |
− | Mon Jul 14 00:16:49 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00046 : INFO)
| |
− | csa103CompI1: Waiting for CSI assignment...
| |
− | Mon Jul 14 00:16:49 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00047 : INFO)
| |
− | csa103CompI1: checkpoint_initialize
| |
− | Mon Jul 14 00:16:50 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00054 : INFO)
| |
− | csa103CompI1: Checkpoint service initialized (handle=0x1)
| |
− | Mon Jul 14 00:16:50 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00056 : INFO)
| |
− | csa103CompI1: Checkpoint opened (handle=0x2)
| |
− | </pre></code>
| |
− | |}
| |
| | | |
− | That is the component that had been killed being restarted.
| + | ==> var/log/csa104CompI1Log.latest <== |
− | | + | Fri Jan 11 14:43:27.403 2013 (SCNodeI0.3302 : csa104Comp.---.---.00156 : INFO) csa104: Sending Message: Msg 142 from csa104CompI1 |
− | CompI0 is moving along just fine, and we see CompI1 come back up. If we then kill CompI0 we see:
| + | |
− | | + | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI1Log.latest
| + | |
− | |-
| + | |
− | |<code><pre>
| + | |
− | Mon Jul 14 00:29:44 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00065 : INFO)
| + | |
− | csa103CompI1: Active state requested from state 2
| + | |
− | Mon Jul 14 00:29:44 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00066 : INFO)
| + | |
− | csa103CompI1 reading checkpoint
| + | |
− | Mon Jul 14 00:29:44 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00067 : INFO)
| + | |
− | csa103CompI1 read checkpoint: seq = 1225 | + | |
− | Mon Jul 14 00:29:45 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00068 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=1225)
| + | |
− | Mon Jul 14 00:29:46 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00069 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=1226)
| + | |
| </pre></code> | | </pre></code> |
− | |}
| |
− |
| |
− | Where we can see the CompI0 process die, and CompI1 process read the sequence number from the checkpoint and then take over from where CompI0 left off.
| |
− |
| |
− | <li>To stop csa103 use the following SAFplus Platform Console command.
| |
− | <code><pre>
| |
− | cli[Test:SCNodeI0:CPM]-> amsLockAssignment sg csa103SGI0
| |
− | </pre></code>
| |
− |
| |
− | <li>Now change the state of csa103SGI0 to LockInstantiation and close the SAFplus Platform Console.
| |
− | <code><pre>
| |
− | cli[Test:SCNodeI0:CPM]-> amsLockInstantiation sg csa103SGI0
| |
− | cli[Test:SCNodeI0:CPM] -> end
| |
− | cli[Test:SCNodeI0] -> end
| |
− | cli[Test] -> bye
| |
− | </pre></code>
| |
− | </ol>
| |
− |
| |
− | ===csa203===
| |
− |
| |
− | csa203 demonstrates the usage of SA Forum's Checkpointing service. This sample application does not deviate functionally from csa103. The code differences are due to using SA Forum data types (structures) and APIs , as presented in the following two tables. (Note we have not repeated data types and APIs covered previously.)
| |
− |
| |
− | {| cellspacing="0" border="1" align="center"
| |
− | |+align="bottom"| '''SA Forum Data Types with the SAFplus Platform equivalent'''
| |
− | |- style="color:black;background-color:#ffffaa;" align="center"
| |
− | !SA Forum Data Types
| |
− | !OpenClovis Data Types
| |
− | |-
| |
− | |SaCkptHandleT
| |
− | |ClCkptSvcHdlT
| |
− | |-
| |
− | |SaCkptHandleT
| |
− | |ClCkptHdlT
| |
− | |-
| |
− | |SaCkptSectionIdT
| |
− | |ClCkptSectionIdT
| |
− | |-
| |
− | |SaCkptCheckpointCreationAttributesT
| |
− | |ClCkptCheckpointCreationAttributesT
| |
− | |-
| |
− | |SaCkptSectionCreationAttributesT
| |
− | |ClCkptSectionCreationAttributesT
| |
− | |-
| |
− | |SaCkptIOVectorElementT
| |
− | |ClCkptIOVectorElementT
| |
| |} | | |} |
| | | |
| + | As you can see, the messaging continues through the failure. |
| | | |
| + | But actually this service group was configured to demonstrate some advanced AMF failover semantics as well as messaging. |
| | | |
| + | As can be seen in the logs above, a failover did not happen. Instead, the active was restarted and reassigned active. This occurred because this service group was configured to allow component restarts (in the IDE, look at the isRestartable field in the csa104Comp). |
| | | |
− | {| cellspacing="0" border="1" align="center"
| + | However, the service group was configured so that multiple kills in quick succession will cause a failover. In particular, 2 failures within 10 seconds will cause the fault to be elevated to the service unit level and a further 2 failures within 10 seconds will elevate the failure to the service group level (and cause a failover). These failure counts and time limits are configured in the service group configuration dialog box. |
− | |+align="bottom"| '''SA Forum APIs with the SAFplus Platform equivalent'''
| + | |
− | |- style="color:black;background-color:#ffffaa;" align="center"
| + | |
− | !SA Forum APIs
| + | |
− | !OpenClovis APIs
| + | |
− | |-
| + | |
− | |saCkptInitialize
| + | |
− | |clCkptInitialize
| + | |
− | |-
| + | |
− | |saCkptCheckpointOpen
| + | |
− | |clCkptCheckpointOpen
| + | |
− | |-
| + | |
− | |saCkptSectionCreate
| + | |
− | |clCkptSectionCreate
| + | |
− | |-
| + | |
− | |saCkptCheckpointClose
| + | |
− | |clCkptCheckpointClose
| + | |
− | |-
| + | |
− | |saCkptFinalize
| + | |
− | |clCkptFinalize
| + | |
− | |-
| + | |
− | |saCkptSectionOverwrite
| + | |
− | |clCkptSectionOverwrite
| + | |
− | |-
| + | |
− | |saCkptCheckpointSynchronize
| + | |
− | |clCkptCheckpointSynchronize
| + | |
− | |-
| + | |
− | |saCkptCheckpointRead
| + | |
− | |clCkptCheckpointRead
| + | |
− | |}
| + | |
| | | |
| + | *By carefully selecting your failure elevation strategy, you direct the AMF to automatically use process restarts and therefore limit the scope of failures* |
| | | |
− | ===How to Run csa103 and What to Observe===
| |
| | | |
− | This sample application can run on all Runtime Hardware Setups. The following, lists which node <code>csa103compI*</code> runs on.
| + | So now, let's kill the process 4 times in a row. This can be done quickly by watching the "tailed" logs and repeatedly killing the process that receives messages: |
− | <ul>
| + | |
− | <li>'''Runtime Hardware Setup 1.1 and 2.1'''
| + | |
− | <br>csa103compI0 and csa103compI1 will run upon the single node.
| + | |
− | <li>'''Runtime Hardware Setup 1.2 and 2.2'''
| + | |
− | <br>csa103compI0 runs on PayloadNodeI0 and csa103compI1 runs on PayloadNodeI1
| + | |
− | <li>'''Runtime Hardware Setup 1.3 and 2.3'''
| + | |
− | <br>csa103compI2 and csa103compI3 run on SCNodeI0 and SCNodeI1 respectively. csa103compI0 and csa103compI1 run on PayloadNodeI0 and PayloadNodeI1 respectively.
| + | |
− | <br><br>In the four node set up SCNodeI0 and SCNodeI1 output data via <code>/root/asp/var/log/csa103CompI2</code> and <code>/root/asp/var/log/csa103CompI3</code> respectively. Therefore in this case follow the below instructions replacing csa103CompI0 and csa103compI1 with csa103compI2 and csa103compI3 respectively.
| + | |
− | </ul>
| + | |
| | | |
− | We run csa103 very much the same way we run any of the rest of the sample applications: with the SAFplus Platform Console.
| + | # kill 4069 |
− |
| + | # kill 5566 |
− | <ol>
| + | # kill 5595 |
− | <li>First, on the active System Controller move it to state LockAssignment with (Unlock csa203SGI0 instead of csa103SGI0 to run csa203)
| + | # kill 5691 |
− | <code><pre>
| + | |
− | cli[Test]-> setc 1
| + | |
− | cli[Test:SCNodeI0]-> setc cpm
| + | |
− | cli[Test:SCNodeI0:CPM]-> amsLockAssignment sg csa103SGI0
| + | |
− | </pre></code>
| + | |
| | | |
− | The following output is given when you run <code>tail -f</code> on the csa103 log files. For example:
| + | This causes the following logs: |
− | <code><pre>
| + | |
− | # tail -f /root/asp/var/log/csa103CompI0.log
| + | |
− | </pre></code>
| + | |
− | | + | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI0Log.latest
| + | |
− | |-
| + | |
− | |<code><pre>
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00030 : INFO)
| + | |
− | Component [csa103CompI0] : PID [15238]. Initializing
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00031 : INFO)
| + | |
− | IOC Address : 0x1
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00032 : INFO)
| + | |
− | IOC Port : 0x81
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00033 : INFO)
| + | |
− | csa103: Instantiated as component instance csa103CompI0.
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00034 : INFO)
| + | |
− | csa103CompI0: Waiting for CSI assignment...
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00035 : INFO)
| + | |
− | csa103CompI0: Waiting for CSI assignment...
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00036 : INFO)
| + | |
− | csa103CompI0: checkpoint_initialize
| + | |
− | Mon Jul 14 00:01:10 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00043 : INFO)
| + | |
− | csa103CompI0: Checkpoint service initialized (handle=0x1)
| + | |
− | Mon Jul 14 00:01:10 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00045 : INFO)
| + | |
− | csa103CompI0: Checkpoint opened (handle=0x2)
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
− | | + | |
− | <code><pre>
| + | |
− | # tail -f /root/asp/var/log/csa103CompI1.log
| + | |
− | </pre></code>
| + | |
| | | |
| {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" | | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" |
Line 1,046: |
Line 499: |
| |- | | |- |
| |<code><pre> | | |<code><pre> |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00030 : INFO)
| + | ==> var/log/csa104CompI1Log.latest <== |
− | Component [csa103CompI1] : PID [15234]. Initializing
| + | Fri Jan 11 14:56:15.783 2013 (SCNodeI0.3302 : csa104Comp.---.---.00540 : INFO) csa104: Sending Message: Msg 526 from csa104CompI1 |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00031 : INFO)
| + | |
− | IOC Address : 0x1
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00032 : INFO)
| + | |
− | IOC Port : 0x80
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00033 : INFO)
| + | |
− | csa103: Instantiated as component instance csa103CompI1. | + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00034 : INFO)
| + | |
− | csa103CompI1: Waiting for CSI assignment...
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00035 : INFO)
| + | |
− | csa103CompI1: Waiting for CSI assignment...
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00036 : INFO)
| + | |
− | csa103CompI1: checkpoint_initialize
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00043 : INFO)
| + | |
− | csa103CompI1: Checkpoint service initialized (handle=0x1)
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00045 : INFO)
| + | |
− | csa103CompI1: Checkpoint opened (handle=0x2)
| + | |
− | Mon Jul 14 00:01:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00053 : INFO)
| + | |
− | csa103CompI1: Section created
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | <li>Then, unlock the application by running
| + | ==> var/log/csa104CompI0Log.latest <== |
− | <code><pre> | + | Fri Jan 11 14:56:15.784 2013 (SCNodeI0.5691 : csa104Comp.---.---.00018 : INFO) Received Message : Msg 526 from csa104CompI1 |
− | cli[Test:SCNodeI0:CPM]-> amsUnlock sg csa103SGI0
| + | |
− | </pre></code>
| + | |
| | | |
− | In the application log files, you should see
| |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | ==> var/log/csa104CompI1Log.latest <== |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI0Log.latest
| + | Fri Jan 11 14:56:17.123 2013 (SCNodeI0.3302 : csa104Comp.---.---.00541 : INFO) Component [csa104CompI1] : PID [3302]. CSI Set Received |
− | |-
| + | |
− | |<code><pre>
| + | |
− | Mon Jul 14 00:09:08 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00058 : INFO)
| + | |
− | Standby state requested from state 0
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | Fri Jan 11 14:56:17.123 2013 (SCNodeI0.3302 : csa104Comp.---.---.00542 : INFO) CSI Flags : [Target All] |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI1Log.latest
| + | Fri Jan 11 14:56:17.123 2013 (SCNodeI0.3302 : csa104Comp.---.---.00543 : INFO) HA state : [Active] |
− | |-
| + | Fri Jan 11 14:56:17.123 2013 (SCNodeI0.3302 : csa104Comp.---.---.00544 : INFO) Active Descriptor : |
− | |<code><pre>
| + | Fri Jan 11 14:56:17.123 2013 (SCNodeI0.3302 : csa104Comp.---.---.00545 : INFO) Transition Descriptor : [3] |
− | Mon Jul 14 00:09:08 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00064 : INFO)
| + | Fri Jan 11 14:56:17.123 2013 (SCNodeI0.3302 : csa104Comp.---.---.00546 : INFO) Active Component : [csa104CompI0] |
− | csa103CompI1: Active state requested from state 0
| + | Fri Jan 11 14:56:17.123 2013 (SCNodeI0.3302 : csa104Comp.---.---.00547 : INFO) csa104: ACTIVE state requested; activating message queue receiver service |
− | Mon Jul 14 00:09:09 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00065 : INFO)
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.3302 : csa104Comp.---.---.00548 : INFO) Received Message : Msg 1 from csa104CompI0 |
− | csa103CompI1: Hello World! (seq=0)
| + | |
− | Mon Jul 14 00:09:10 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00066 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=1)
| + | |
− | Mon Jul 14 00:09:11 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00067 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=2)
| + | |
− | Mon Jul 14 00:09:12 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00068 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=3)
| + | |
− | Mon Jul 14 00:09:13 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00069 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=4) | + | |
| </pre></code> | | </pre></code> |
− | |} | + | |} |
| | | |
− | <li>Next, find the active csa103 process, the one that's printing the Hello World lines and kill it. To find the process ID issue the following command from a bash shell.
| + | Above, we see that we have triggered a failover and the standby process is being assigned active (note this is "CompI1" and has the original pid of 3302). |
− | <code><pre>
| + | |
− | # ps -eaf | grep csa103
| + | |
− | </pre></code>
| + | |
− | This should produce an output that looks similar to the following.
| + | |
− | root 17830 15663 0 14:21 ? 00:00:00 csa103Comp -p
| + | |
− | root 17839 15663 0 14:21 ? 00:00:00 csa103Comp -p
| + | |
− | root 18558 16145 0 14:32 pts/4 00:00:00 grep csa103
| + | |
− | Notice the two entries that end with csa103Comp -p. These are our two component processes. The first one is usually the active process. This is the one that we will kill. In this case the process ID is 17830. So to kill the active component you issue the command:
| + | |
| | | |
− | # kill -9 17830
| + | Next, the process is restarted and assigned standby: |
− | | + | |
− | [[File:OpenClovis_Note.png]]If this step does not result in the active component being killed then it is likely that the standby component was killed. In this case simply try killing the other process.
| + | |
− | | + | |
− | After killing the active component you should see lines in the log files like the following:
| + | |
| | | |
| {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" | | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680" |
Line 1,124: |
Line 527: |
| |- | | |- |
| |<code><pre> | | |<code><pre> |
− | Mon Jul 14 00:16:43 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00515 : INFO)
| + | ==> var/log/csa104CompI0Log.latest <== |
− | csa103CompI1: Hello World! (seq=452)
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00001 : INFO) Component [csa104CompI0] : PID [5747]. Initializing |
− | Mon Jul 14 00:16:44 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00516 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=453)
| + | |
− | Mon Jul 14 00:16:45 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00517 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=454)
| + | |
− | Mon Jul 14 00:16:46 2008 (SCNodeI0.15234 : csa103CompEO.---.---.00518 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=455)
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00002 : INFO) IOC Address : 0x1 |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /var/log/csa103CompI0Log.latest
| + | |
− | |-
| + | |
− | |<code><pre>
| + | |
− | Mon Jul 14 00:16:48 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00065 : INFO)
| + | |
− | csa103CompI0: Active state requested from state 2
| + | |
− | Mon Jul 14 00:16:48 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00066 : INFO)
| + | |
− | csa103CompI0 reading checkpoint
| + | |
− | Mon Jul 14 00:16:48 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00067 : INFO)
| + | |
− | csa103CompI0 read checkpoint: seq = 456
| + | |
− | Mon Jul 14 00:16:49 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00068 : INFO)
| + | |
− | csa103CompI0: Hello World! (seq=456)
| + | |
− | Mon Jul 14 00:16:50 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00069 : INFO)
| + | |
− | csa103CompI0: Hello World! (seq=457)
| + | |
− | Mon Jul 14 00:16:51 2008 (SCNodeI0.15238 : csa103CompEO.---.---.00070 : INFO)
| + | |
− | csa103CompI0: Hello World! (seq=458)
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | Where we can see CompI1 printing 455 and then dieing, where upon CompI0 gets the notice to take over processing, reads the checkpoint and then takes over with seq=456 and so on.
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00003 : INFO) IOC Port : 0x8d |
| | | |
− | Then, in the CompI1 log file we see:
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00004 : INFO) csa102: Instantiated as component instance csa104CompI0. |
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00005 : INFO) csa104CompI0: Waiting for CSI assignment... |
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00006 : INFO) Component [csa104CompI0] : PID [5747]. CSI Set Received |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00007 : INFO) CSI Flags : [Add One] |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI1Log.latest
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00008 : INFO) CSI Name : [csa104CSII] |
− | |-
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00009 : INFO) Name value pairs : |
− | |<code><pre>
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00010 : INFO) HA state : [Standby] |
− | Mon Jul 14 00:16:49 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00044 : INFO)
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00011 : INFO) Standby Descriptor : |
− | csa103: Instantiated as component instance csa103CompI1.
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00012 : INFO) Standby Rank : [1] |
− | Mon Jul 14 00:16:49 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00045 : INFO)
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00013 : INFO) Active Component : [csa104CompI1] |
− | csa103CompI1: Waiting for CSI assignment...
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00014 : INFO) csa104: Standby state requested |
− | Mon Jul 14 00:16:49 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00046 : INFO)
| + | Fri Jan 11 14:56:17.155 2013 (SCNodeI0.5747 : csa104Comp.---.---.00015 : INFO) csa104: Sending Message: Msg 1 from csa104CompI0 |
− | csa103CompI1: Waiting for CSI assignment...
| + | |
− | Mon Jul 14 00:16:49 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00047 : INFO)
| + | |
− | csa103CompI1: checkpoint_initialize
| + | |
− | Mon Jul 14 00:16:50 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00054 : INFO)
| + | |
− | csa103CompI1: Checkpoint service initialized (handle=0x1)
| + | |
− | Mon Jul 14 00:16:50 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00056 : INFO)
| + | |
− | csa103CompI1: Checkpoint opened (handle=0x2)
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | That is the component that had been killed being restarted.
| + | ==> var/log/csa104CompI1Log.latest <== |
| + | Fri Jan 11 14:56:17.784 2013 (SCNodeI0.3302 : csa104Comp.---.---.00549 : INFO) csa104: Sending Message: Msg 527 from csa104CompI1 |
| + | Fri Jan 11 14:56:17.784 2013 (SCNodeI0.3302 : csa104Comp.---.---.00550 : INFO) Received Message : Msg 527 from csa104CompI1 |
| | | |
− | CompI0 is moving along just fine, and we see CompI1 come back up. If we then kill CompI0 we see:
| |
| | | |
− | {| cellspacing="0" cellpadding = "0" border="0" align = "center" width="680"
| + | ==> var/log/csa104CompI0Log.latest <== |
− | ! style="color:black;background-color:#ffffaa;" align="center"| /root/asp/var/log/csa103CompI1Log.latest
| + | Fri Jan 11 14:56:19.156 2013 (SCNodeI0.5747 : csa104Comp.---.---.00016 : INFO) csa104: Sending Message: Msg 2 from csa104CompI0 |
− | |-
| + | |
− | |<code><pre>
| + | |
− | Mon Jul 14 00:29:44 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00065 : INFO)
| + | |
− | csa103CompI1: Active state requested from state 2
| + | |
− | Mon Jul 14 00:29:44 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00066 : INFO)
| + | |
− | csa103CompI1 reading checkpoint
| + | |
− | Mon Jul 14 00:29:44 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00067 : INFO)
| + | |
− | csa103CompI1 read checkpoint: seq = 1225
| + | |
− | Mon Jul 14 00:29:45 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00068 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=1225)
| + | |
− | Mon Jul 14 00:29:46 2008 (SCNodeI0.15553 : csa103CompEO.---.---.00069 : INFO)
| + | |
− | csa103CompI1: Hello World! (seq=1226)
| + | |
− | </pre></code>
| + | |
− | |}
| + | |
| | | |
− | Where we can see the CompI0 process die, and CompI1 process read the sequence number from the checkpoint and then take over from where CompI0 left off.
| + | ==> var/log/csa104CompI1Log.latest <== |
| + | Fri Jan 11 14:56:19.156 2013 (SCNodeI0.3302 : csa104Comp.---.---.00551 : INFO) Received Message : Msg 2 from csa104CompI0 |
| | | |
− | <li>To stop csa103 use the following SAFplus Platform Console command.
| + | Fri Jan 11 14:56:19.785 2013 (SCNodeI0.3302 : csa104Comp.---.---.00552 : INFO) csa104: Sending Message: Msg 528 from csa104CompI1 |
− | <code><pre>
| + | Fri Jan 11 14:56:19.785 2013 (SCNodeI0.3302 : csa104Comp.---.---.00553 : INFO) Received Message : Msg 528 from csa104CompI1 |
− | cli[Test:SCNodeI0:CPM]-> amsLockAssignment sg csa103SGI0
| + | </pre></code> |
− | </pre></code>
| + | |} |
− | | + | |
− | <li>Now change the state of csa103SGI0 to LockInstantiation and close the SAFplus Platform Console.
| + | |
− | <code><pre>
| + | |
− | cli[Test:SCNodeI0:CPM]-> amsLockInstantiation sg csa103SGI0
| + | |
− | cli[Test:SCNodeI0:CPM] -> end
| + | |
− | cli[Test:SCNodeI0] -> end
| + | |
− | cli[Test] -> bye
| + | |
− | </pre></code> | + | |
− | </ol>
| + | |
| | | |
− | ===Additional Tests for Runtime Hardware Setup 1.3 and 2.3===
| + | And message passing resumes! |
| | | |
− | Now repeat the experiment till the stage before we kill the active process and follow these steps.
| + | ===Further Investigation=== |
| | | |
− | '''Step A'''
| + | The following are a few changes that you can make to continue investigating the message service: |
− | <ol>
| + | |
− | <li>Stop SAFplus Platform on SCNodeI0 using <code>/etc/init.d/asp stop</code>. Observer the logs <code>/root/asp/var/log/csa103CompI3Log.latest</code> on SCNodeI1. This will become active and start printing the hello world logs as above.
| + | |
− | Note in this case active System Controller SCNode1 is still aware of PayloadNodeI0 and PayloadNodeI1.
| + | |
− | <li>Start looking at the logs for <code>/root/asp/var/log/csa103CompI0Log.latest</code> in PayloadNodeI0. This will print the standby logs as above.
| + | |
− | <li>Stop SAFplus Platform on PayloadNodeI0 and start observing the logs on PayloadNodeI1 in <code>/root/asp/var/log/csa103CompI1Log.latest</code> for standby.
| + | |
− | In this case, you can see, via the active System Controller logs that PayloadNodeI0 is not active whilst PayloadNodeI1 is.
| + | |
− | </ol>
| + | |
| | | |
− | '''Step B''' For Hardware Setup 1.3 only
| + | * Add a new "reply" queue so the active can send the standby messages. |
− | #Repeat A with the slight exception that the blade running SCNodeI0 is yanked out(<code>/etc/init.d/asp zap</code>) instead of gracefully shutting down SAFplus Platform in step 1.
| + | * Use the saMsgMessageSendReceive and saMsgMessageReply functions to implement send/reply semantics. |
| + | * Using the IDE, change the component's fault escalation behavior so that it fails over the first time the process is killed rather than restarting. |
| + | * Investigate message queue groups. |
| | | |
| ===Summary and References=== | | ===Summary and References=== |
Messaging refers to communications between components within the cluster. SAFplus provides an SA-Forum-compliant implementation of the messaging service. In one sentence, the messaging service is a reliable packet based communications mechanism which stores and addresses endpoints via cluster-wide message queues that are identified by a well-known name and can be bound to any running process. For more details and API reference please see the SA-Forum spec SA-AIS-MSG-B*.pdf.
csa104 demonstrates the use of the SAFplus Messaging service to provide basic communications during process and node failure.
To increase readability, all messaging code has been isolated into a single module that consists of 2 files: msgFns.c and msgFns.h. These files provide the following APIs.
These APIs constitute the basic operations required by any application that uses messaging: initialize, open, send and receive.
The ACTIVE_COMP_QUEUE defines the well-known name of the messaging queue which will be used in this example. The QUEUE_LENGTH defines the maximum size of the buffer allocated for each priority within a particular queue. There are 4 possible message priorities and the maximum buffer size can be set on a per priority basis.
The messaging library supports either a callback or threaded paradigm. In this example, we will use threading so no callbacks are installed and therefore it is not necessary to periodically call saMsgDispatch(...).
To receive messages from the queue, the application must first open it. The open essentially binds the well-known name to the application process so that senders know where to direct messages. By providing an explicit bind operation (rather than folding it into the library initialize) the API allows the application to choose when it takes ownership of the queue; this could be when it becomes active (or standby) for example, allowing senders to address messages to a single well-known queue name if they need to communicate with the active component.
Designing addresses that represent a concept such as "The currently active transaction server" rather than a physical entity is an extremely powerful design pattern used throughout highly available applications because it means that the sender does not need to access the real-time cluster state to determine this mapping and does not need to handle errors caused by application failure (except perhaps with a simple retry-on-error loop).
Message transmission uses the saMsgMessageSend API which allows the application to pass a message buffer and a bunch of meta-data (such as message version, sender's name, etc) that will be sent to the receiver:
For simplicity, this example creates the destination SaNameT and the SaMsgMessageT structures each time a message is sent. However, for efficiency when sending multiple messages to the same destination it is preferred to pre-create and reuse these objects.
This example will create a dedicated message receiver thread and run the following code within that thread:
This code simply loops "forever" receiving messages and printing the contents of the message. If anything goes wrong it kicks itself out of the message processing loop and closes the message queue. Closing the queue will allow some other application to open it, in effect "taking over" the queue. If an application is killed or disappears due to node death (or other events) the AMF will close all queues opened by the application. This allows the new active application to take control of the queue.
This sample application runs 2 processes on SCNodeI0 (first system controller) in all the hardware setups described at the beginning of this eval guide. While it is certainly possible to run messaging across multiple nodes, this single node configuration makes evaluation simpler.
csa104 is "enabled" by default when SAFplus is started so there is no need to enter the SAFplus Debug Console and change its program state.
The logs show the work assignment occurring with csa104CompI0 as ACTIVE and csa104CompI1 as STANDBY. This causes the csa104CompI1 (standby) component to start sending messages and the csa104CompI0 (active) component to begin receiving them.
Next, find the active csa104 process and kill it. In this case, it's the one that's receiving the messages. The process ID is available in the log, which is formatted as follows:
So in the log above the active process is pid 3301.
After killing the active component you should see lines in the log files like the following:
As you can see, the messaging continues through the failure.
But actually this service group was configured to demonstrate some advanced AMF failover semantics as well as messaging.
As can be seen in the logs above, a failover did not happen. Instead, the active was restarted and reassigned active. This occurred because this service group was configured to allow component restarts (in the IDE, look at the isRestartable field in the csa104Comp).
However, the service group was configured so that multiple kills in quick succession will cause a failover. In particular, 2 failures within 10 seconds will cause the fault to be elevated to the service unit level and a further 2 failures within 10 seconds will elevate the failure to the service group level (and cause a fail over). These failure counts and time limits are configured in the service group configuration dialog box.
Above, we see that we have triggered a failover and the standby process is being assigned active (note this is "Comp1" and has the original pid of 3302).
The following are a few changes that you can make to continue investigating the message service: