Configuration Example
To install EGS on the cluster, first clone the egs-installation repository. The cloned repository includes the script to install EGS. The script requires a YAML configuration file to define various parameters and settings for the installation process.
warning
Do not copy the example YAML configuration directly. Hash characters (#) used in comments may not be properly interpreted. Always refer to the actual egs-only-config.yaml file available in the repository for accurate configuration.
Navigate to the cloned repository and look for the configuration YAML file called egs-only-config.yaml.
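For example, from the root of the cloned repository (the script name and flag below follow the repository's README; verify them against your clone):

cd egs-installation
./egs-installer.sh --input-yaml egs-only-config.yaml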
The following is an example egs-only-config.yaml file. Each parameter is described with an inline comment.
########################### MANDATORY PARAMETERS ####################################################################
# Global image pull secret settings
global_image_pull_secret:
  repository: "https://index.docker.io/v1/" # Docker registry URL
  username: "" # Global Docker registry username
  password: "" # Global Docker registry password
# Kubeconfig settings
global_kubeconfig: "" # Relative path to the global kubeconfig file (must be in the script directory) - Mandatory
global_kubecontext: "" # Global kubecontext to use - Mandatory
use_global_context: true # If true, use the global kubecontext for all operations by default
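# To find the context name to use for global_kubecontext, list the contexts in
# your kubeconfig (standard kubectl commands):
#   kubectl config get-contexts
#   kubectl config current-context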
# Enable or disable specific stages of the installation
enable_install_controller: true # Enable the installation of the Kubeslice controller
enable_install_ui: true # Enable the installation of the Kubeslice UI
enable_install_worker: true # Enable the installation of Kubeslice workers
# Enable or disable the installation of additional applications (Prometheus, GPU operator, PostgreSQL)
enable_install_additional_apps: false # Set to true to enable additional apps installation
# Enable custom applications
# Set this to true if you want to allow custom applications to be deployed.
# This is specifically useful for enabling NVIDIA driver installation on your nodes.
enable_custom_apps: true
# Command execution settings
# Set this to true to allow the execution of commands for configuring NVIDIA MIG.
# This includes modifications to the NVIDIA ClusterPolicy and applying node labels
# based on the MIG strategy defined in the YAML (e.g., single or mixed strategy).
run_commands: false
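# Example of the kind of change run_commands enables (a sketch; the exact label
# value depends on your MIG strategy and GPU operator version):
#   kubectl label nodes <gpu-node> nvidia.com/mig.config=all-1g.5gb --overwrite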
#########################################################################################################################
########################### OPTIONAL CONFIGURATION PARAMETERS ###########################################################
# Project and cluster registration settings
enable_project_creation: true # Enable project creation in Kubeslice
enable_cluster_registration: true # Enable cluster registration in Kubeslice
enable_prepare_worker_values_file: true # Prepare the worker values file for Helm charts
enable_autofetch_egsagent_endpoint_and_token: true # If false, skip updating the egsAgent endpoint and token values in the worker values file
# Global monitoring endpoint settings
global_auto_fetch_endpoint: false # Enable automatic fetching of monitoring endpoints globally
global_grafana_namespace: egs-monitoring # Namespace where Grafana is globally deployed
global_grafana_service_type: ClusterIP # Service type for Grafana (accessible only within the cluster)
global_grafana_service_name: prometheus-grafana # Service name for accessing Grafana globally
global_prometheus_namespace: egs-monitoring # Namespace where Prometheus is globally deployed
global_prometheus_service_name: prometheus-kube-prometheus-prometheus # Service name for accessing Prometheus globally
global_prometheus_service_type: ClusterIP # Service type for Prometheus (accessible only within the cluster)
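# To confirm the Grafana and Prometheus service names and types in your
# monitoring namespace:
#   kubectl get svc -n egs-monitoring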
# Precheck options
precheck: true # Run general prechecks before starting the installation
kubeslice_precheck: true # Run specific prechecks for Kubeslice components
# Global installation verification settings
verify_install: false # Enable verification of installations globally
verify_install_timeout: 600 # Timeout for global installation verification (in seconds)
skip_on_verify_fail: true # If set to true, skip steps where verification fails, otherwise exit on failure
# Base path settings
base_path: "" # If left empty, the script will use the relative path to the script as the base path
# Helm repository settings
use_local_charts: true # Use local Helm charts instead of fetching them from a repository
local_charts_path: "charts" # Path to the directory containing local Helm charts
global_helm_repo_url: "" # URL for the global Helm repository (if not using local charts)
global_helm_username: "" # Username for accessing the global Helm repository
global_helm_password: "" # Password for accessing the global Helm repository
readd_helm_repos: true # Re-add Helm repositories even if they are already present
#### Kubeslice Controller Installation Settings ####
kubeslice_controller_egs:
  skip_installation: false # Do not skip the installation of the controller
  use_global_kubeconfig: true # Use global kubeconfig for the controller installation
  specific_use_local_charts: true # Override to use local charts for the controller
  kubeconfig: "" # Path to the kubeconfig file specific to the controller; if empty, uses the global kubeconfig
  kubecontext: "" # Kubecontext specific to the controller; if empty, uses the global context
  namespace: "kubeslice-controller" # Kubernetes namespace where the controller will be installed
  release: "egs-controller" # Helm release name for the controller
  chart: "kubeslice-controller-egs" # Helm chart name for the controller
  #### Inline Helm Values for the Controller Chart ####
  inline_values:
    global:
      imageRegistry: docker.io/aveshasystems # Docker registry for the images
      namespaceConfig: # Labels or annotations that the EGS controller namespaces should have
        labels: {}
        annotations: {}
      kubeTally:
        enabled: false # Enable KubeTally in the controller
        #### PostgreSQL Connection Configuration for KubeTally ####
        # Name of the secret in the kubeslice-controller namespace that holds the PostgreSQL
        # credentials. If all of the connection values below are specified, the installer
        # creates a secret with this name. Alternatively, leave the values below empty and
        # provide the name of a pre-created secret that uses the same connection-details format.
        postgresSecretName: kubetally-db-credentials
        postgresAddr: "kt-postgresql.kt-postgresql.svc.cluster.local" # Change this address to your PostgreSQL endpoint
        postgresPort: 5432 # Change this port to match your PostgreSQL service
        postgresUser: "postgres" # Change this username to your PostgreSQL username
        postgresPassword: "postgres" # Change this password to your PostgreSQL password
        postgresDB: "postgres" # Change this database name to your PostgreSQL database
        postgresSslmode: disable # Change this SSL mode to match your PostgreSQL connection
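        # A minimal sketch of pre-creating the credentials secret yourself (assumed
        # key names mirroring the fields above; verify against the chart documentation):
        #   kubectl create secret generic kubetally-db-credentials \
        #     -n kubeslice-controller \
        #     --from-literal=postgresAddr=kt-postgresql.kt-postgresql.svc.cluster.local \
        #     --from-literal=postgresPort=5432 \
        #     --from-literal=postgresUser=postgres \
        #     --from-literal=postgresPassword=postgres \
        #     --from-literal=postgresDB=postgres \
        #     --from-literal=postgresSslmode=disable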
        prometheusUrl: http://prometheus-kube-prometheus-prometheus.egs-monitoring.svc.cluster.local:9090 # Prometheus URL for monitoring
    kubeslice:
      controller:
        endpoint: "" # Endpoint of the controller API server; auto-fetched if left empty
  #### Helm Flags and Verification Settings ####
  helm_flags: "--wait --timeout 5m --debug" # Additional Helm flags for the installation
  verify_install: false # Verify the installation of the controller
  verify_install_timeout: 30 # Timeout for the controller installation verification (in seconds)
  skip_on_verify_fail: true # If true, skip the step when verification fails; otherwise exit on failure
  #### Troubleshooting Settings ####
  enable_troubleshoot: false # Enable troubleshooting mode for additional logs and checks
#### Kubeslice UI Installation Settings ####
kubeslice_ui_egs:
  skip_installation: false # Do not skip the installation of the UI
  use_global_kubeconfig: true # Use global kubeconfig for the UI installation
  kubeconfig: "" # Path to the kubeconfig file specific to the UI; if empty, uses the global kubeconfig
  kubecontext: "" # Kubecontext specific to the UI; if empty, uses the global context
  namespace: "kubeslice-controller" # Kubernetes namespace where the UI will be installed
  release: "egs-ui" # Helm release name for the UI
  chart: "kubeslice-ui-egs" # Helm chart name for the UI
  #### Inline Helm Values for the UI Chart ####
  inline_values:
    global:
      imageRegistry: docker.io/aveshasystems # Docker registry for the UI images
    kubeslice:
      prometheus:
        url: http://prometheus-kube-prometheus-prometheus.egs-monitoring.svc.cluster.local:9090 # Prometheus URL for monitoring
      uiproxy:
        service:
          type: ClusterIP # Service type for the UI proxy
          ## If type is set to NodePort, set the nodePort value if required
          # nodePort:
          # port: 443
          # targetPort: 8443
        labels:
          app: kubeslice-ui-proxy
        annotations: {}
        ingress:
          ## If true, a ui-proxy Ingress will be created
          enabled: false
          ## Port on the Service to route to
          servicePort: 443
          ## Ingress class name (e.g. "nginx"), if you are using a custom ingress controller
          className: ""
          hosts:
            - host: ui.kubeslice.com # Replace with your FQDN
              paths:
                - path: / # Base path
                  pathType: Prefix # Prefix | Exact
          ## TLS configuration (you must create these Secrets ahead of time)
          tls: []
          # - hosts:
          #     - ui.kubeslice.com
          #   secretName: uitlssecret
          annotations: []
          ## Extra labels to add onto the Ingress object
          extraLabels: {}
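          # The TLS secret referenced above must exist before enabling ingress with
          # TLS; a standard way to create it (certificate paths are placeholders):
          #   kubectl create secret tls uitlssecret \
          #     --cert=path/to/tls.crt --key=path/to/tls.key \
          #     -n kubeslice-controller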
      egsCoreApis:
        enabled: true # Enable EGS core APIs for the UI
        service:
          type: ClusterIP # Service type for the EGS core APIs
  #### Helm Flags and Verification Settings ####
  helm_flags: "--wait --timeout 5m --debug" # Additional Helm flags for the UI installation
  verify_install: false # Verify the installation of the UI
  verify_install_timeout: 50 # Timeout for the UI installation verification (in seconds)
  skip_on_verify_fail: true # If true, skip the step when UI verification fails
  #### Chart Source Settings ####
  specific_use_local_charts: true # Override to use local charts for the UI
#### Kubeslice Worker Installation Settings ####
kubeslice_worker_egs:
  - name: "worker-1" # Worker name
    use_global_kubeconfig: true # Use global kubeconfig for this worker
    kubeconfig: "" # Path to the kubeconfig file specific to the worker; if empty, uses the global kubeconfig
    kubecontext: "" # Kubecontext specific to the worker; if empty, uses the global context
    skip_installation: false # Do not skip the installation of the worker
    specific_use_local_charts: true # Override to use local charts for this worker
    namespace: "kubeslice-system" # Kubernetes namespace for this worker
    release: "egs-worker" # Helm release name for the worker
    chart: "kubeslice-worker-egs" # Helm chart name for the worker
    #### Inline Helm Values for the Worker Chart ####
    inline_values:
      global:
        imageRegistry: docker.io/aveshasystems # Docker registry for worker images
      egs:
        prometheusEndpoint: "http://prometheus-kube-prometheus-prometheus.egs-monitoring.svc.cluster.local:9090" # Prometheus endpoint
        grafanaDashboardBaseUrl: "http://<grafana-lb>/d/Oxed_c6Wz" # Grafana dashboard base URL
      egsAgent:
        secretName: egs-agent-access
        agentSecret:
          endpoint: ""
          key: ""
      metrics:
        insecure: true # Allow insecure connections for metrics
      kserve:
        enabled: true # Enable KServe for the worker
        kserve: # KServe chart options
          controller:
            gateway:
              domain: kubeslice.com
              ingressGateway:
                className: "nginx" # Ingress class name for the KServe gateway
    #### Helm Flags and Verification Settings ####
    helm_flags: "--wait --timeout 5m --debug" # Additional Helm flags for the worker installation
    verify_install: true # Verify the installation of the worker
    verify_install_timeout: 60 # Timeout for the worker installation verification (in seconds)
    skip_on_verify_fail: false # Do not skip if worker verification fails
    #### Troubleshooting Settings ####
    enable_troubleshoot: false # Enable troubleshooting mode for additional logs and checks
    #### Local Monitoring Endpoint Settings (Optional) ####
    # local_auto_fetch_endpoint: true # Enable automatic fetching of monitoring endpoints
    # local_grafana_namespace: egs-monitoring # Namespace where Grafana is deployed
    # local_grafana_service_name: prometheus-grafana # Service name for accessing Grafana
    # local_grafana_service_type: ClusterIP # Service type for Grafana (accessible only within the cluster)
    # local_prometheus_namespace: egs-monitoring # Namespace where Prometheus is deployed
    # local_prometheus_service_name: prometheus-kube-prometheus-prometheus # Service name for accessing Prometheus
    # local_prometheus_service_type: ClusterIP # Service type for Prometheus (accessible only within the cluster)
#### Define Projects ####
projects:
  - name: "avesha" # Name of the Kubeslice project
    username: "admin" # Username for accessing the Kubeslice project
#### Define Cluster Registration ####
cluster_registration:
  - cluster_name: "worker-1" # Name of the cluster to be registered
    project_name: "avesha" # Name of the project to associate with the cluster
    #### Telemetry Settings ####
    telemetry:
      enabled: true # Enable telemetry for this cluster
      endpoint: "http://prometheus-kube-prometheus-prometheus.egs-monitoring.svc.cluster.local:9090" # Telemetry endpoint
      telemetryProvider: "prometheus" # Telemetry provider (Prometheus in this case)
    #### Geo-Location Settings ####
    geoLocation:
      cloudProvider: "" # Cloud provider for this cluster (e.g., GCP)
      cloudRegion: "" # Cloud region for this cluster (e.g., us-central1)
#### Define Additional Applications to Install ####
additional_apps:
  - name: "gpu-operator" # Name of the application
    skip_installation: false # Do not skip the installation of the GPU operator
    use_global_kubeconfig: true # Use global kubeconfig for this application
    kubeconfig: "" # Path to the kubeconfig file specific to this application
    kubecontext: "" # Kubecontext specific to this application; uses global context if empty
    namespace: "egs-gpu-operator" # Namespace where the GPU operator will be installed
    release: "gpu-operator" # Helm release name for the GPU operator
    chart: "gpu-operator" # Helm chart name for the GPU operator
    repo_url: "https://helm.ngc.nvidia.com/nvidia" # Helm repository URL for the GPU operator
    version: "v24.9.1" # Version of the GPU operator to install
    specific_use_local_charts: true # Use local charts for this application
    #### Inline Helm Values for GPU Operator ####
    inline_values:
      hostPaths:
        driverInstallDir: "/home/kubernetes/bin/nvidia"
      toolkit:
        installDir: "/home/kubernetes/bin/nvidia"
      cdi:
        enabled: true
        default: true
      # mig:
      #   strategy: "mixed"
      # migManager: # Enable to ensure that the node reboots and can apply the MIG configuration
      #   env:
      #     - name: WITH_REBOOT
      #       value: "true"
      driver:
        enabled: false
    helm_flags: "--debug" # Additional Helm flags for this application's installation
    verify_install: false # Verify the installation of the GPU operator
    verify_install_timeout: 600 # Timeout for verification (in seconds)
    skip_on_verify_fail: true # Skip the step if verification fails
    enable_troubleshoot: false # Enable troubleshooting mode for additional logs and checks
- name: "prometheus" # Name of the application
skip_installation: false # Do not skip the installation of Prometheus
use_global_kubeconfig: true # Use global kubeconfig for Prometheus
kubeconfig: "" # Path to the kubeconfig file specific to this application
kubecontext: "" # Kubecontext specific to this application; uses global context if empty
namespace: "egs-monitoring" # Namespace where Prometheus will be installed
release: "prometheus" # Helm release name for Prometheus
chart: "kube-prometheus-stack" # Helm chart name for Prometheus
repo_url: "https://prometheus-community.github.io/helm-charts" # Helm repository URL for Prometheus
version: "v45.0.0" # Version of the Prometheus stack to install
specific_use_local_charts: true # Use local charts for this application
values_file: "" # Path to an external values file, if any
#### Inline Helm Values for Prometheus ####
inline_values:
prometheus:
service:
type: ClusterIP # Service type for Prometheus
prometheusSpec:
storageSpec: {} # Placeholder for storage configuration
additionalScrapeConfigs:
- job_name: tgi
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod_name
- source_labels: [__meta_kubernetes_pod_container_name]
target_label: container_name
- job_name: gpu-metrics
scrape_interval: 1s
metrics_path: /metrics
scheme: http
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- egs-gpu-operator
relabel_configs:
- source_labels: [__meta_kubernetes_endpoints_name]
action: drop
regex: .*-node-feature-discovery-master
- source_labels: [__meta_kubernetes_pod_node_name]
action: replace
target_label: kubernetes_node
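      # To confirm these scrape jobs are active after install, port-forward the
      # Prometheus service and open http://localhost:9090/targets:
      #   kubectl port-forward -n egs-monitoring \
      #     svc/prometheus-kube-prometheus-prometheus 9090:9090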
      grafana:
        enabled: true # Enable Grafana
        grafana.ini:
          auth:
            disable_login_form: true
            disable_signout_menu: true
          auth.anonymous:
            enabled: true
            org_role: Viewer
        service:
          type: ClusterIP # Service type for Grafana
        persistence:
          enabled: false # Disable persistence
          size: 1Gi # Default persistence size
    helm_flags: "--debug" # Additional Helm flags for this application's installation
    verify_install: false # Verify the installation of Prometheus
    verify_install_timeout: 600 # Timeout for verification (in seconds)
    skip_on_verify_fail: true # Skip the step if verification fails
    enable_troubleshoot: false # Enable troubleshooting mode for additional logs and checks
- name: "postgresql" # Name of the application
skip_installation: false # Do not skip the installation of PostgreSQL
use_global_kubeconfig: true # Use global kubeconfig for PostgreSQL
kubeconfig: "" # Path to the kubeconfig file specific to this application
kubecontext: "" # Kubecontext specific to this application; uses global context if empty
namespace: "kt-postgresql" # Namespace where PostgreSQL will be installed
release: "kt-postgresql" # Helm release name for PostgreSQL
chart: "postgresql" # Helm chart name for PostgreSQL
repo_url: "oci://registry-1.docker.io/bitnamicharts/postgresql" # Helm repository URL for PostgreSQL
version: "16.2.1" # Version of the PostgreSQL chart to install
specific_use_local_charts: true # Use local charts for this application
values_file: "" # Path to an external values file, if any
#### Inline Helm Values for PostgreSQL ####
inline_values:
auth:
postgresPassword: "postgres" # Explicit password (use if not relying on `existingSecret`)
username: "postgres" # Explicit username (fallback if `existingSecret` is not used)
password: "postgres" # Password for PostgreSQL (optional)
database: "postgres" # Default database to create
primary:
persistence:
enabled: false # Disable persistent storage for PostgreSQL
size: 10Gi # Size of the Persistent Volume Claim
helm_flags: "--wait --debug" # Additional Helm flags for this application's installation
verify_install: true # Verify the installation of PostgreSQL
verify_install_timeout: 600 # Timeout for verification (in seconds)
skip_on_verify_fail: false # Do not skip if verification fails
#### Define Custom Applications and Associated Manifests ####
manifests:
  - appname: gpu-operator-quota # Name of the custom application
    manifest: "" # URL or path to the manifest file; if empty, inline YAML is used
    overrides_yaml: "" # Path to an external YAML file with overrides, if any
    inline_yaml: | # Inline YAML content for this custom application
      apiVersion: v1
      kind: ResourceQuota
      metadata:
        name: gpu-operator-quota
      spec:
        hard:
          pods: 100 # Maximum number of pods
        scopeSelector:
          matchExpressions:
            - operator: In
              scopeName: PriorityClass # Define scope for PriorityClass
              values:
                - system-node-critical
                - system-cluster-critical
    use_global_kubeconfig: true # Use global kubeconfig for this application
    skip_installation: false # Do not skip the installation of this application
    verify_install: false # Verify the installation of this application
    verify_install_timeout: 30 # Timeout for verification (in seconds)
    skip_on_verify_fail: true # Skip if verification fails
    namespace: egs-gpu-operator # Namespace for this application
    kubeconfig: "" # Path to the kubeconfig file specific to this application
    kubecontext: "" # Kubecontext specific to this application; uses global context if empty
  - appname: nvidia-driver-installer # Name of the custom application
    manifest: "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml" # URL to the manifest file
    overrides_yaml: "" # Path to an external YAML file with overrides, if any
    inline_yaml: null # Inline YAML content for this application
    use_global_kubeconfig: true # Use global kubeconfig for this application
    kubeconfig: "" # Path to the kubeconfig file specific to this application
    kubecontext: "" # Kubecontext specific to this application; uses global context if empty
    skip_installation: false # Do not skip the installation of this application
    verify_install: false # Verify the installation of this application
    verify_install_timeout: 200 # Timeout for verification (in seconds)
    skip_on_verify_fail: true # Skip if verification fails
    namespace: kube-system # Namespace for this application
#### Define Commands to Execute ####
commands:
  - use_global_kubeconfig: true # Use global kubeconfig for these commands
    kubeconfig: "" # Path to the kubeconfig file specific to these commands
    kubecontext: "" # Kubecontext specific to these commands; uses global context if empty
    skip_installation: false # Do not skip the execution of these commands
    verify_install: false # Verify the execution of these commands
    verify_install_timeout: 200 # Timeout for verification (in seconds)
    skip_on_verify_fail: true # Skip if command verification fails
    namespace: kube-system # Namespace context for these commands
    command_stream: | # Commands to execute
      kubectl create namespace egs-gpu-operator --dry-run=client -o yaml | kubectl apply -f - || true
      kubectl get nodes || true
      kubectl get nodes -o json | jq -r '.items[] | select(.status.capacity["nvidia.com/gpu"] != null) | .metadata.name' | xargs -I {} kubectl label nodes {} gke-no-default-nvidia-gpu-device-plugin=true cloud.google.com/gke-accelerator=true --overwrite || true
      kubectl get nodes -o json | jq -r '.items[] | select(.status.capacity["nvidia.com/gpu"] != null) | .metadata.name' | xargs -I {} sh -c "echo {}; kubectl get node {} -o=jsonpath='{.metadata.labels}' | jq ." || true
      kubectl get clusterpolicies.nvidia.com/cluster-policy --no-headers || true
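    # The stream above labels every node that reports nvidia.com/gpu capacity;
    # to spot-check the labels on a single node afterwards:
    #   kubectl get node <gpu-node> --show-labels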
#### Troubleshooting Mode Settings ####
enable_troubleshoot:
  enabled: false # Globally enable troubleshooting mode for additional logs and checks
  #### Resource Types to Troubleshoot ####
  resource_types:
    - pods
    - deployments
    - daemonsets
    - statefulsets
    - replicasets
    - jobs
    - configmaps
    - secrets
    - services
    - serviceaccounts
    - roles
    - rolebindings
    - crds
  #### API Groups to Troubleshoot ####
  api_groups:
    - controller.kubeslice.io
    - worker.kubeslice.io
    - inventory.kubeslice.io
    - aiops.kubeslice.io
    - networking.kubeslice.io
    - monitoring.coreos.com
#### Upload Log Settings ####
upload_logs:
  enabled: false # Enable log upload functionality
  command: | # Command to execute for log upload (left empty here)
#### List of Required Binaries ####
required_binaries:
  - yq # YAML processor
  - helm # Helm package manager
  - jq # JSON processor
  - kubectl # Kubernetes command-line tool
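# A quick preflight check that all required binaries are on your PATH:
#   for b in yq helm jq kubectl; do command -v "$b" >/dev/null || echo "missing: $b"; done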
#### Node Labeling Settings ####
add_node_label: false # Enable node labeling during installation
# Version of the input configuration file
version: "1.13.0"