Problems caused by zone state in Kubernetes

In a Kubernetes cluster, when a node becomes abnormal (NotReady), the node lifecycle controller (NodeLifecycleController) in kube-controller-manager adds taints to that node; these taints are what drive eviction of the Pods running on it:

```
Taints:             node.kubernetes.io/unreachable:NoExecute
                    node.kubernetes.io/unreachable:NoSchedule
```
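To check this on a live cluster, the taints can be read straight from the node object; the node name below is a placeholder:

```bash
kubectl describe node <node-name> | grep -A1 Taints
```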

Recently, during testing, we hit a case where the Pods running on a node were not deleted after the node went NotReady. Investigation showed that the node carried only a single taint; since there was no taint with effect NoExecute, the Pods on the node were never evicted:

```
Taints:             node.kubernetes.io/unreachable:NoSchedule
```

After raising the kube-controller-manager log level to 10, the behavior became clear. The cluster has three compute nodes: when two of them are made abnormal, both get tainted correctly, but when the third one is made abnormal, it never receives the NoExecute taint. Instead, a rate-limiting message appears in the log, as shown below:
[log screenshot: kube-controller-manager reporting that handling of the node is rate limited]

Reading further back in the log, we can see that the zone has entered the PartialDisruption state; from that point on, node handling is rate limited and this node is no longer processed:
[log screenshot: kube-controller-manager reporting that the zone entered PartialDisruption]

Once the zone is in statePartialDisruption, Pod eviction for NotReady nodes in it is no longer triggered.

Looking at the source code: our cluster has no zone labels configured, so every node falls into a single default zone. A zone enters statePartialDisruption when more than two of its nodes are NotReady and the NotReady ratio reaches the unhealthy-zone-threshold, which defaults to 55%:

```go
// ComputeZoneState returns a slice of NodeReadyConditions for all Nodes in a given zone.
// The zone is considered:
// - fullyDisrupted if there're no Ready Nodes,
// - partiallyDisrupted if at least than nc.unhealthyZoneThreshold percent of Nodes are not Ready,
// - normal otherwise
func (nc *Controller) ComputeZoneState(nodeReadyConditions []*v1.NodeCondition) (int, ZoneState) {
	readyNodes := 0
	notReadyNodes := 0
	for i := range nodeReadyConditions {
		if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == v1.ConditionTrue {
			readyNodes++
		} else {
			notReadyNodes++
		}
	}
	switch {
	case readyNodes == 0 && notReadyNodes > 0:
		return notReadyNodes, stateFullDisruption
	case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold:
		return notReadyNodes, statePartialDisruption
	default:
		return notReadyNodes, stateNormal
	}
}
```
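To see why the test cluster only tripped this once the third compute node failed, here is a standalone sketch that re-implements the same check. The cluster shape used below (one control-plane node that stays Ready plus the three compute nodes) is an assumption based on the symptoms above, not something stated in the logs:

```go
package main

import "fmt"

// zoneState mirrors the logic of ComputeZoneState for a single zone,
// using the default unhealthy-zone-threshold of 0.55.
func zoneState(ready, notReady int) string {
	const unhealthyZoneThreshold = 0.55
	switch {
	case ready == 0 && notReady > 0:
		return "stateFullDisruption"
	case notReady > 2 && float32(notReady)/float32(notReady+ready) >= unhealthyZoneThreshold:
		return "statePartialDisruption"
	default:
		return "stateNormal"
	}
}

func main() {
	// Assumed cluster shape: 1 control-plane node that stays Ready + 3 compute nodes.
	fmt.Println(zoneState(3, 1)) // 1 NotReady: count not > 2                 -> stateNormal
	fmt.Println(zoneState(2, 2)) // 2 NotReady: count still not > 2           -> stateNormal
	fmt.Println(zoneState(1, 3)) // 3 NotReady: 3 > 2 and 3/4 = 75% >= 55%    -> statePartialDisruption
}
```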

When a zone enters statePartialDisruption, the controller swaps the rate limiter on the queue that processes the abnormal nodes:

```go
// In handleDisruption: the unhealthy ratio has been evaluated and the zone's new
// state (here statePartialDisruption) is recorded, then the limiter is swapped.
// We know that there's at least one not-fully disrupted so,
// we can use default behavior for rate limiters
for k, v := range nc.zoneStates {
	newState := newZoneStates[k]
	if v == newState {
		continue
	}
	klog.V(0).Infof("Controller detected that zone %v is now in state %v.", k, newState)
	nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState)
	nc.zoneStates[k] = newState
}

// In setLimiterInZone: once the zone is in statePartialDisruption, the
// processing rate of the zoneNoExecuteTainter queue is changed.
case statePartialDisruption:
	if nc.runTaintManager {
		nc.zoneNoExecuteTainter[zone].SwapLimiter(
			nc.enterPartialDisruptionFunc(zoneSize))
	} else {
		nc.zonePodEvictor[zone].SwapLimiter(
			nc.enterPartialDisruptionFunc(zoneSize))
	}

// enterPartialDisruptionFunc points at ReducedQPSFunc: for clusters that do not
// have more than largeClusterThreshold (default 50) nodes, the QPS becomes 0.
// ReducedQPSFunc returns the QPS for when a the cluster is large make
// evictions slower, if they're small stop evictions altogether.
func (nc *Controller) ReducedQPSFunc(nodeNum int) float32 {
	if int32(nodeNum) > nc.largeClusterThreshold {
		return nc.secondaryEvictionLimiterQPS
	}
	return 0
}

// In SwapLimiter: with QPS <= 0 the queue gets flowcontrol.NewFakeNeverRateLimiter(),
// so items in the queue are never processed again.
var newLimiter flowcontrol.RateLimiter
if newQPS <= 0 {
	newLimiter = flowcontrol.NewFakeNeverRateLimiter()
} else {
	newLimiter = flowcontrol.NewTokenBucketRateLimiter(newQPS, EvictionRateLimiterBurst)

	// If we're currently waiting on limiter, we drain the new one - this is a good approach when Burst value is 1
	// TODO: figure out if we need to support higher Burst values and decide on the drain logic, should we keep:
	// - saturation (percentage of used tokens)
	// - number of used tokens
	// - number of available tokens
	// - something else
	if q.limiter.TryAccept() == false {
		newLimiter.TryAccept()
	}
}
```
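The effect of this swap can be seen directly with the flowcontrol package from k8s.io/client-go. The QPS and burst values below are only illustrative; the controller's actual values come from --node-eviction-rate, --secondary-node-eviction-rate and the EvictionRateLimiterBurst constant:

```go
package main

import (
	"fmt"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	// Normal state: a token-bucket limiter keeps admitting work at the configured QPS.
	bucket := flowcontrol.NewTokenBucketRateLimiter(0.1, 1)
	fmt.Println("token bucket TryAccept:", bucket.TryAccept()) // true: the burst token is available

	// PartialDisruption in a small cluster: QPS <= 0, so the queue gets a limiter
	// that never admits anything, and queued evictions are never processed.
	never := flowcontrol.NewFakeNeverRateLimiter()
	fmt.Println("never limiter TryAccept:", never.TryAccept()) // always false
}
```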

For a small cluster, if you still want Pods on abnormal nodes to be evicted correctly, you can set the kube-controller-manager flag unhealthy-zone-threshold=1 (its default is 0.55).
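On a kubeadm-style deployment, for instance, this is a flag on the kube-controller-manager static pod. The manifest path and surrounding fields below are assumptions; only the flag itself comes from the discussion above:

```yaml
# /etc/kubernetes/manifests/kube-controller-manager.yaml (path assumes kubeadm)
spec:
  containers:
  - command:
    - kube-controller-manager
    - --unhealthy-zone-threshold=1.0   # default is 0.55
    # existing flags stay unchanged
```

With the threshold at 1.0, the PartialDisruption branch can effectively never fire while at least one node in the zone is still Ready, so a partially failed zone stays in stateNormal and its NotReady nodes keep receiving the NoExecute taint.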