CLOUDP-306333: Remove monitoring hosts on downscaling #652
base: master
Changes from all commits
@@ -0,0 +1,6 @@
---
kind: fix
date: 2025-12-16
---

* Fix an issue to ensure that hosts are consistently removed from Ops Manager monitoring during MongoDB and AppDB scale-down events.
@@ -63,8 +63,11 @@ type MockedOmConnection struct {
    backupAgentConfig *BackupAgentConfig
    monitoringAgentConfig *MonitoringAgentConfig
    controlledFeature *controlledfeature.ControlledFeature
    // hosts are used for both automation agents and monitoring endpoints.
    // They are necessary for emulating "agents" are ready behavior as operator checks for hosts for agents to exist
    // In Ops Manager, "hosts" and "automation agents" are two different things:
    // - hostResults: the monitored hosts shown in the OM UI (via /hosts API)
    // - agentHostnameMap: the automation agents that ping OM (via /agents/AUTOMATION API)
    // When we remove a host from monitoring (e.g. during scale down), the automation
    // agent on that host doesn't just disappear - it keeps running until the pod is deleted.
    hostResults *host.Result
    agentHostnameMap map[string]struct{}

Author comment: Without fixing the mock (separating the two agent types), some tests for the sharded cluster were failing after the changes.
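To make the behavior described in the new comment concrete, here is a minimal, hypothetical test sketch (not part of this PR) of how the two collections diverge. It is written as if it lives in the same package as the mock so it can read the unexported fields; imports are omitted and the hostname is made up.

```go
func TestRemoveHostKeepsAgentRegistered(t *testing.T) {
	// Hypothetical test, assuming in-package access to MockedOmConnection's fields.
	conn := &MockedOmConnection{
		hostResults:      &host.Result{},
		agentHostnameMap: map[string]struct{}{},
	}

	// AddHost registers the host both as a monitored host (/hosts) and, because the
	// ID is generated, as an automation agent (/agents/AUTOMATION) in the mock.
	if err := conn.AddHost(host.Host{Hostname: "my-rs-2.my-ns.svc.cluster.local"}); err != nil {
		t.Fatal(err)
	}

	// Removing the host from monitoring (what the operator does on scale-down) only
	// affects the /hosts side; the "agent" keeps pinging the mocked OM.
	if err := conn.RemoveHost(conn.hostResults.Results[0].Id); err != nil {
		t.Fatal(err)
	}
	if len(conn.hostResults.Results) != 0 {
		t.Fatalf("expected no monitored hosts, got %d", len(conn.hostResults.Results))
	}
	if _, ok := conn.agentHostnameMap["my-rs-2.my-ns.svc.cluster.local"]; !ok {
		t.Fatal("expected the automation agent to remain registered")
	}
}
```

This mirrors a real scale-down: the host disappears from the /hosts view while its automation agent keeps reporting until the pod is deleted.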
@@ -168,6 +171,11 @@ func NewEmptyMockedOmConnection(ctx *OMContext) Connection {
func NewMockedOmConnection(d Deployment) *MockedOmConnection {
    connection := MockedOmConnection{deployment: d}
    connection.hostResults = buildHostsFromDeployment(d)
    // Also populate agentHostnameMap so the mock knows which agents are "registered"
    connection.agentHostnameMap = make(map[string]struct{})
    for _, h := range connection.hostResults.Results {
        connection.agentHostnameMap[h.Hostname] = struct{}{}
    }
    connection.BackupConfigs = make(map[string]*backup.Config)
    connection.BackupHostClusters = make(map[string]*backup.HostCluster)
    connection.SnapshotSchedules = make(map[string]*backup.SnapshotSchedule)
@@ -377,8 +385,16 @@ func (oc *MockedOmConnection) ReadUpdateAutomationConfig(modifyACFunc func(ac *A
    return err
}

func (oc *MockedOmConnection) AddHost(host host.Host) error {
    oc.hostResults.Results = append(oc.hostResults.Results, host)
func (oc *MockedOmConnection) AddHost(h host.Host) error {
    // Generate an ID if not set (like the real OM API would do)
    if h.Id == "" {
        if oc.agentHostnameMap == nil {
            oc.agentHostnameMap = map[string]struct{}{}
        }
        h.Id = strconv.Itoa(len(oc.hostResults.Results))
        oc.agentHostnameMap[h.Hostname] = struct{}{}
    }
    oc.hostResults.Results = append(oc.hostResults.Results, h)
    return nil
}
@@ -473,10 +489,12 @@ func (oc *MockedOmConnection) ReadAutomationAgents(pageNum int) (Paginated, erro
        return oc.ReadAutomationAgentsFunc(pageNum)
    }

    // We use agentHostnameMap here, not hostResults. In real OM, the /agents/AUTOMATION
    // endpoint returns agents based on their heartbeats, independent of the /hosts endpoint.
    results := make([]AgentStatus, 0)
    for _, r := range oc.hostResults.Results {
    for hostname := range oc.agentHostnameMap {
        results = append(results,
            AgentStatus{Hostname: r.Hostname, LastConf: time.Now().Add(time.Second * -1).Format(time.RFC3339)})
            AgentStatus{Hostname: hostname, LastConf: time.Now().Add(time.Second * -1).Format(time.RFC3339)})
    }

    return AutomationAgentStatusResponse{AutomationAgents: results}, nil
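For context on why the mock must keep agents registered independently of monitored hosts: the operator waits for specific hostnames to show up as recently-pinging agents (WaitForRsAgentsToRegisterSpecifiedHostnames is called later in this diff). A simplified, hypothetical readiness check over this endpoint could look like the following; the function name, the type assertion, and the one-minute threshold are illustrative and not the repo's actual implementation (imports of fmt and time omitted).

```go
// agentsReady is an illustrative helper (not in the repo): it reports whether every
// expected hostname has an automation agent that pinged OM within the last minute.
func agentsReady(conn Connection, expectedHostnames []string) (bool, error) {
	resp, err := conn.ReadAutomationAgents(0)
	if err != nil {
		return false, err
	}
	// Assumes the concrete response type returned by the mock above.
	statusResp, ok := resp.(AutomationAgentStatusResponse)
	if !ok {
		return false, fmt.Errorf("unexpected response type %T", resp)
	}

	lastPing := map[string]time.Time{}
	for _, agent := range statusResp.AutomationAgents {
		pinged, err := time.Parse(time.RFC3339, agent.LastConf)
		if err != nil {
			return false, err
		}
		lastPing[agent.Hostname] = pinged
	}

	for _, hostname := range expectedHostnames {
		pinged, ok := lastPing[hostname]
		if !ok || time.Since(pinged) > time.Minute {
			return false, nil
		}
	}
	return true, nil
}
```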
@@ -496,9 +514,8 @@ func (oc *MockedOmConnection) RemoveHost(hostID string) error {
        }
    }
    oc.hostResults = &host.Result{Results: toKeep}
    oc.agentHostnameMap = util.TransformToMap(oc.hostResults.Results, func(obj host.Host, idx int) (string, struct{}) {
        return obj.Hostname, struct{}{}
    })
    // We don't touch agentHostnameMap here - in real OM, removing a host from monitoring
    // doesn't unregister its automation agent. The agent keeps pinging until the pod dies.
    return nil
}
@@ -712,15 +712,10 @@ func (r *ReconcileMongoDbMultiReplicaSet) saveLastAchievedSpec(ctx context.Conte
    return annotations.SetAnnotations(ctx, &mrs, annotationsToAdd, r.client)
}

// updateOmDeploymentRs performs OM registration operation for the replicaset. So the changes will be finally propagated
// to automation agents in containers
func (r *ReconcileMongoDbMultiReplicaSet) updateOmDeploymentRs(ctx context.Context, conn om.Connection, mrs mdbmultiv1.MongoDBMultiCluster, agentCertPath, tlsCertPath, internalClusterCertPath string, isRecovering bool, log *zap.SugaredLogger) error {
    reachableHostnames := make([]string, 0)

    clusterSpecList, err := mrs.GetClusterSpecItems()
    if err != nil {
        return err
    }
// getAllHostnames returns the hostnames of all replicas across all clusters.
// Unhealthy clusters are ignored when reachableClustersOnly is set to true
func (r *ReconcileMongoDbMultiReplicaSet) getAllHostnames(mrs mdbmultiv1.MongoDBMultiCluster, clusterSpecList mdb.ClusterSpecList, reachableClustersOnly bool, log *zap.SugaredLogger) ([]string, error) {

Author comment: I extracted this logic, which was in "updateOmDeploymentRs", into a subfunction.

    var hostnames []string
    failedClusterNames, err := mrs.GetFailedClusterNames()
    if err != nil {
        // When failing to retrieve the list of failed clusters we proceed assuming there are no failed clusters,
@@ -729,15 +724,33 @@ func (r *ReconcileMongoDbMultiReplicaSet) updateOmDeploymentRs(ctx context.Conte
    }
    for _, spec := range clusterSpecList {
        hostnamesToAdd := dns.GetMultiClusterProcessHostnames(mrs.Name, mrs.Namespace, mrs.ClusterNum(spec.ClusterName), spec.Members, mrs.Spec.GetClusterDomain(), mrs.Spec.GetExternalDomainForMemberCluster(spec.ClusterName))
        if stringutil.Contains(failedClusterNames, spec.ClusterName) {
        if stringutil.Contains(failedClusterNames, spec.ClusterName) && reachableClustersOnly {
            log.Debugf("Skipping hostnames %+v as they are part of the failed cluster %s ", hostnamesToAdd, spec.ClusterName)
            continue
        }
        if mrs.GetClusterSpecByName(spec.ClusterName) == nil {
            log.Debugf("Skipping hostnames %+v as they are part of a cluster not known by the operator %s ", hostnamesToAdd, spec.ClusterName)
            continue
        }
        reachableHostnames = append(reachableHostnames, hostnamesToAdd...)
        hostnames = append(hostnames, hostnamesToAdd...)
    }

    return hostnames, nil
}

// updateOmDeploymentRs performs OM registration operation for the replicaset. So the changes will be finally propagated
// to automation agents in containers
func (r *ReconcileMongoDbMultiReplicaSet) updateOmDeploymentRs(ctx context.Context, conn om.Connection, mrs mdbmultiv1.MongoDBMultiCluster, agentCertPath, tlsCertPath, internalClusterCertPath string, isRecovering bool, log *zap.SugaredLogger) error {
    // This clusterSpecList reflects the desired state for this reconciliation, not the final one (the resource spec)
    clusterSpecList, err := mrs.GetClusterSpecItems()
    if err != nil {
        return err
    }

    reachableHostnames, err := r.getAllHostnames(mrs, clusterSpecList, true, log)
    if err != nil {
        return err
    }

    err = agents.WaitForRsAgentsToRegisterSpecifiedHostnames(conn, reachableHostnames, log)
@@ -814,6 +827,17 @@ func (r *ReconcileMongoDbMultiReplicaSet) updateOmDeploymentRs(ctx context.Conte
    if err := om.WaitForReadyState(conn, reachableProcessNames, isRecovering, log); err != nil && !isRecovering {
        return err
    }

    // The hostnames we get here are the ones for the current reconciliation. Not the final state.
    // Note that we include unhealthy clusters (we don't want to remove them from monitoring)
    allHostNames, err := r.getAllHostnames(mrs, clusterSpecList, false, log)
    if err != nil && !isRecovering {
        return err
    }
    if err := host.RemoveUndesiredMonitoringHosts(conn, allHostNames, log); err != nil {
        log.Warnf("failed to remove stale host(s) from Ops Manager monitoring: %s", err.Error())
    }

    return nil
}
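host.RemoveUndesiredMonitoringHosts is called above, but its implementation is not part of this diff. Below is a rough sketch of the idea, written as if it sat in the host package; the HostGetRemover interface, the GetHosts reader, and the exact signature are assumptions for illustration, while RemoveHost and the Result/Host types do appear elsewhere in this diff (imports omitted).

```go
// HostGetRemover is an assumed, narrow view of the OM connection used by this sketch.
type HostGetRemover interface {
	GetHosts() (*Result, error)
	RemoveHost(hostID string) error
}

// RemoveUndesiredMonitoringHosts (sketch): remove every monitored host that is not
// part of the desired topology for this reconciliation, e.g. scaled-down members.
func RemoveUndesiredMonitoringHosts(conn HostGetRemover, desiredHostnames []string, log *zap.SugaredLogger) error {
	desired := make(map[string]struct{}, len(desiredHostnames))
	for _, hostname := range desiredHostnames {
		desired[hostname] = struct{}{}
	}

	monitored, err := conn.GetHosts()
	if err != nil {
		return err
	}

	for _, h := range monitored.Results {
		if _, ok := desired[h.Hostname]; ok {
			continue
		}
		log.Debugf("Removing host %s (%s) from Ops Manager monitoring", h.Hostname, h.Id)
		if err := conn.RemoveHost(h.Id); err != nil {
			return err
		}
	}
	return nil
}
```

The important property, matching the comments above, is that the pruning is driven by the full desired hostname list, including unhealthy clusters, so hosts on temporarily unreachable clusters are never removed from monitoring.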
Reviewer comment: LGTM!