From 886d4f49b6352924a5de4642a11bbadac8fbf497 Mon Sep 17 00:00:00 2001 From: Valentin Klopfenstein <134367834+klopfenstein-puzzle@users.noreply.github.com> Date: Wed, 12 Mar 2025 17:47:49 +0100 Subject: [PATCH] Retry tests on network errors, improve test docs (#64) --- .github/workflows/test-go.yaml | 28 ++++++++- .github/workflows/test-kubernetes.yaml | 59 ++++++++++++++----- .../workflows/workflow_full-test-suite.yaml | 8 ++- Makefile | 4 +- README.md | 33 ++++++++--- src/main_test.go | 18 +++++- testdata/README.md | 3 +- 7 files changed, 122 insertions(+), 31 deletions(-) diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index 3e89bb1..086b054 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -2,6 +2,9 @@ name: Run code tests on: push: + paths: + - '.github/workflows/**' + - 'src/**' workflow_call: secrets: DNSIMPLE_API_TOKEN: @@ -37,11 +40,17 @@ jobs: env: DNSIMPLE_API_TOKEN: ${{ secrets.DNSIMPLE_API_TOKEN }} DNSIMPLE_ZONE_NAME: ${{ secrets.DNSIMPLE_ZONE_NAME }} + shell: 'script -q -e -c "bash {0}"' + timeout-minutes: 15 run: | export TEST_ASSET_KUBE_APISERVER=${{ steps.kubebuilder.outputs.BIN_DIR }}/kube-apiserver export TEST_ASSET_ETCD=${{ steps.kubebuilder.outputs.BIN_DIR }}/etcd export TEST_ASSET_KUBECTL=${{ steps.kubebuilder.outputs.BIN_DIR }}/kubectl export TEST_ZONE_NAME="${DNSIMPLE_ZONE_NAME}." # add trailing dot + + YLW='\033[1;33m' + NC='\033[0m' + echo """apiVersion: v1 kind: Secret metadata: @@ -51,4 +60,21 @@ jobs: token: $DNSIMPLE_API_TOKEN """ > testdata/dnsimple-token.yaml cd src - go test -v . + + # Occasionally, transient network errors can make tests fail + attempt=0 + max_attempts=3 + test_exit_code=0 + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt+1)) + output=$(go test -v . 2>&1 | tee /dev/tty) + test_exit_code=$? + + if echo "$output" | grep -q -e "Temporary failure in name resolution" -e "connection reset by peer" -e "i/o timeout"; then + echo -e "${YLW}Detected transient network error. Retrying... ($attempt/$max_attempts)${NC}" + else + break + fi + done + + exit $test_exit_code diff --git a/.github/workflows/test-kubernetes.yaml b/.github/workflows/test-kubernetes.yaml index 797bd50..f05ce52 100644 --- a/.github/workflows/test-kubernetes.yaml +++ b/.github/workflows/test-kubernetes.yaml @@ -12,26 +12,31 @@ on: jobs: test: runs-on: ubuntu-latest + strategy: + max-parallel: 3 + matrix: + # Always quote versions to prevent int truncation (1.30 -> 1.3) + # https://kubernetes.io/releases + k8s-version: ["1.30", "1.31", "1.32"] + # https://cert-manager.io/docs/releases/ (Always include path version) + cm-version: ["1.16.0", "1.17.0"] steps: - uses: actions/checkout@v4 - - name: Start minikube uses: medyagh/setup-minikube@master with: - kubernetes-version: 1.31.3 - + kubernetes-version: ${{ matrix.k8s-version }} - name: Install cert-manager, patch upstream dns servers, wait for readiness run: | - echo "Target cert-manager version: ${{ vars.TARGET_CERT_MANAGER_VERSION }}" - kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/${{ vars.TARGET_CERT_MANAGER_VERSION }}/cert-manager.yaml + echo "Target cert-manager version: ${{ matrix.cm-version }}" + kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v${{ matrix.cm-version }}/cert-manager.yaml # Patch cert-manager to use DNSimple's nameservers for faster propagation-checks kubectl patch deployment cert-manager -n cert-manager --type='json' -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--dns01-recursive-nameservers=ns1.dnsimple.com:53"}]' kubectl wait --for=condition=available --timeout=600s deployment/cert-manager-webhook -n cert-manager - - name: Install cert-manager-webhook-dnsimple, wait for readiness env: DNSIMPLE_API_TOKEN: ${{ secrets.DNSIMPLE_API_TOKEN }} @@ -48,7 +53,7 @@ jobs: helm -n cert-manager list - max_wait_time_seconds=600 + max_wait_time_seconds=800 sleep_between_iterations=10 start=$(date +%s) @@ -57,7 +62,7 @@ jobs: echo "" echo "Awaiting succesful deployment for max ${max_wait_time_seconds} seconds or until $(date --date="@$end")" while [ $(date +%s) -le $end ]; do - echo "[i] New iteration at $(date +%s)" + echo "[i] New iteration at $(date)" kubectl -n cert-manager get po if [ $(kubectl -n cert-manager get po | grep Crash | wc -l) -gt 0 ]; then @@ -101,11 +106,10 @@ jobs: """ > certificate.yaml kubectl apply -f certificate.yaml - - name: Assert that the DNS record was created env: DNSIMPLE_ZONE_NAME: ${{ secrets.DNSIMPLE_ZONE_NAME }} - timeout-minutes: 10 + timeout-minutes: 10 run: | while true; do if nslookup -type=TXT _acme-challenge.gh-action-test.$DNSIMPLE_ZONE_NAME ns1.dnsimple.com; then @@ -114,9 +118,36 @@ jobs: sleep 30 done - + # This step can time out, but it timing out doesn't necessarily mean that the webhook is not working. + # Timeouts mainly happen due to the environment of the runner and/or parallelism, thus such occurrences will simply be dismissed as warnings. - name: Check the certificate status run: | - kubectl wait --for=condition=ready --timeout=600s certificate/dnsimple-test - # this should not be necessary since the certificate is usually ready once the DNS record is propagated - kubectl get certificate dnsimple-test -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' | grep True + max_wait_time_seconds=300 + end=$(( $(date +%s) + $max_wait_time_seconds )) + start=$(date +%s) + + sleep 5 + while [ $(date +%s) -le $end ]; do + OUT_CRT=$(kubectl get certificate/dnsimple-test -o jsonpath='{.status.conditions}') + OUT_CRQ=$(kubectl get CertificateRequest -o json) + + echo "Certificate:" + echo "$OUT_CRT" + + echo "CertificateRequest:" + echo "$OUT_CRQ" | jq .items[0].status.conditions + + if [ $(echo "$OUT_CRT" | grep -iE "Failed|Denied" | wc -l) -gt 0 ]; then + echo "::Error title=Certificate resource errored::The certificate ressource has an error" + exit 1 + fi + + if [ $(kubectl get certificate dnsimple-test -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}') == "True" ]; then + echo "Certificate is ready after $(( $(date +%s) - $start )) seconds" + exit 0 + fi + sleep 20 + echo -e "\n[i] New iteration at $(date)" + done + + echo "::warning title=Certificate timed out::Have timed out waiting for certificate" diff --git a/.github/workflows/workflow_full-test-suite.yaml b/.github/workflows/workflow_full-test-suite.yaml index 97132b9..13facbc 100644 --- a/.github/workflows/workflow_full-test-suite.yaml +++ b/.github/workflows/workflow_full-test-suite.yaml @@ -1,12 +1,14 @@ name: Run full test suite on: - push: - branches: - - master + # To prevent this time intesive suite from running redundandtly, it will only run on PRs. + # If a PR is merged, it also creates a push and thus this workflow unnecessarily runs again. pull_request: branches: - master + paths: + - '.github/workflows/**' + - 'src/**' jobs: code-test: diff --git a/Makefile b/Makefile index 1c05642..47fd1e7 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,9 @@ GO ?= $(shell which go) OS ?= $(shell $(GO) env GOOS) ARCH ?= $(shell $(GO) env GOARCH) -KUBE_VERSION=1.25.0 + +# Available versions: https://storage.googleapis.com/kubebuilder-tools +KUBE_VERSION=$(shell curl -s https://storage.googleapis.com/kubebuilder-tools | grep -oP 'kubebuilder-tools-\K[0-9]+\.[0-9]+\.[0-9]+' | sort -V | tail -n 1 || echo "1.30.0") # required by go tests export TEST_ASSET_ETCD=../_test/kubebuilder/etcd diff --git a/README.md b/README.md index 6f821f2..5b4a580 100644 --- a/README.md +++ b/README.md @@ -75,26 +75,34 @@ The Helm chart accepts the following values: All cert-manager webhooks have to pass the DNS01 provider conformance testing suite. ### Pull requests -Prerequisites for PRs are implemented as GitHub-actions. All tests should pass before a PR is merged: -- the `cert-manager` conformance suite is run with provided kubebuilder fixtures -- a custom test suite running on a working k8s cluster (using `minikube`) is executed as well +Prerequisites for PRs are implemented as GitHub-actions. All tests should pass before a PR is merged: +- The `cert-manager` conformance suite is run with provided kubebuilder fixtures +- A custom test suite running on a working k8s cluster (using `minikube`) is executed as well ### Local testing #### Test suite -You can also run tests locally, as specified in the `Makefile`: +Tests can be run locally according to the `Makefile`: -1. Set-up `testdata/` according to its [README][3]. - - `dnsimple-token.yaml` should be filled with a valid token (for either the sandbox or production environment) - - `dnsimple.env` should contain the remaining environment variables (non sensitive) -2. Execute the test suite: +1. Set up `testdata/` according to its [README][3] + - `dnsimple-token.yaml` should be filled with a valid token (for either the sandbox or production environment) + +2. Set env var `TEST_ZONE_NAME`, adding a trailing dot + - `export TEST_ZONE_NAME="."` + +3. Execute the test suite: ```bash make test ``` + +> [!NOTE] +> Kubebuilder will always use the latest version available. + #### In-cluster testing 1. Install cert-manager: ```bash kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.3/cert-manager.yaml ``` + 2. Install the webhook: ```bash helm install cert-manager-webhook-dnsimple \ @@ -103,6 +111,7 @@ You can also run tests locally, as specified in the `Makefile`: --set clusterIssuer.staging.enabled=true \ ./charts/cert-manager-webhook-dnsimple ``` + 3. Test away... You can create a sample certificate to ensure the webhook is working correctly: ```bash kubectl apply -f - <<`. **We recommend using a specific version tag for production deployments instead.** +Every push to `master` or on a pull-request triggers the upload of a new docker image to the GitHub Container Registry (this is configured through github actions). +These images should **not be considered stable** and are tagged with `commit-`. **We recommend using a specific version tag for production deployments instead.** Tagged images are considered stable, these are the ones referenced by the default helm values. diff --git a/src/main_test.go b/src/main_test.go index 774b339..83bab6b 100644 --- a/src/main_test.go +++ b/src/main_test.go @@ -1,22 +1,36 @@ package main import ( + "fmt" "os" + "strings" "testing" dns "github.com/cert-manager/cert-manager/test/acme" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" ) -var ( - zone = os.Getenv("TEST_ZONE_NAME") +const ( testdata_dir = "../testdata" ) +var ( + zone = os.Getenv("TEST_ZONE_NAME") +) + func TestRunsSuite(t *testing.T) { + log.SetLogger(zap.New(zap.UseFlagOptions(&zap.Options{}))) + // The manifest path should contain a file named config.json that is a // snippet of valid configuration that should be included on the // ChallengeRequest passed as part of the test cases. + // Ensure trailing dot + if !strings.HasSuffix(zone, ".") { + zone = fmt.Sprintf("%s.", zone) + } + fixture := dns.NewFixture(&dnsimpleDNSProviderSolver{}, dns.SetResolvedZone(zone), dns.SetAllowAmbientCredentials(false), diff --git a/testdata/README.md b/testdata/README.md index 6ff2abd..1567fcc 100644 --- a/testdata/README.md +++ b/testdata/README.md @@ -6,4 +6,5 @@ Copy the `dnsimple-token.yaml.example` example file removing the `.example` suff $ cp dnsimple-token.yaml{.example,} ``` -Replace the placeholders for the API token in `dnsimple-token.yaml`. The API token can be generated in your DNSimple account settings in the automation tab. +Replace the placeholders for the API token in `dnsimple-token.yaml`. +The API token can be generated in your DNSimple account settings in the automation tab. \ No newline at end of file