Configuration Example
To install EGS on the cluster, first clone the egs-installation repository. The cloned repository includes the script to install EGS. The script requires a YAML configuration file to define various parameters and settings for the installation process.
Warning
Do not copy the example YAML configuration directly. Hash characters (#) used in comments may not be properly interpreted.
Always refer to the actual `egs-installer-config.yaml` file available in the repository for accurate configuration.
Navigate to the cloned repository and look for the configuration YAML file called `egs-installer-config.yaml`.
The following is an example `egs-installer-config.yaml` file. Each parameter is described in a comment.
# Base path to the root directory of your cloned repository.
# If left empty, the script will use the relative path to the script as the base path.
base_path: ""

# Precheck options
# Run general prechecks before starting the installation.
precheck: true
# Run specific prechecks for Kubeslice components.
kubeslice_precheck: true

# Global installation verification settings
# Enable verification of installations globally.
verify_install: true
# Timeout for global installation verification (in seconds).
verify_install_timeout: 600
# If set to true, skip steps where verification fails; otherwise exit on failure.
skip_on_verify_fail: false
# Helm repository settings
# Use local Helm charts instead of fetching them from a repository.
use_local_charts: true
# Path to the directory containing local Helm charts.
local_charts_path: "charts"
# URL for the global Helm repository (if not using local charts).
global_helm_repo_url: ""
# Username for accessing the global Helm repository.
global_helm_username: ""
# Password for accessing the global Helm repository.
global_helm_password: ""
# Re-add Helm repositories even if they are already present.
readd_helm_repos: true
# Binaries that the installation process requires
required_binaries:
- yq       # YAML processor
- helm     # Helm package manager
- jq       # JSON processor
- kubectl  # Kubernetes command-line tool
# Global image pull secret settings
global_image_pull_secret:
  repository: "https://index.docker.io/v1/"  # Docker registry URL
  username: ""  # Global Docker registry username
  password: ""  # Global Docker registry password
# Node labeling settings
# Enable node labeling during installation.
add_node_label: true

# Kubeconfig settings
# Path to the global kubeconfig file (used if no specific kubeconfig is provided).
global_kubeconfig: ""
# Global kubecontext to use; if empty, the default context will be used.
global_kubecontext: ""
# If true, use the global kubecontext for all operations by default.
use_global_context: true

# Enable or disable specific stages of the installation
# Prepare the worker values file for Helm charts.
enable_prepare_worker_values_file: true
# Enable the installation of the Kubeslice controller.
enable_install_controller: true
# Enable the installation of the Kubeslice UI.
enable_install_ui: true
# Enable the installation of Kubeslice workers.
enable_install_worker: true
# Kubeslice controller installation settings
kubeslice_controller_egs:
  skip_installation: false  # Do not skip the installation of the controller
  use_global_kubeconfig: true  # Use global kubeconfig for the controller installation
  specific_use_local_charts: true  # Override to use local charts for the controller
  kubeconfig: ""  # Path to the kubeconfig file specific to the controller
  # Kubecontext specific to the controller; if empty, uses the global context
  kubecontext: ""
  # Kubernetes namespace where the controller will be installed
  namespace: "kubeslice-controller"
  release: "kubeslice-controller-release"  # Helm release name for the controller
  chart: "kubeslice-controller-egs"  # Helm chart name for the controller
  # Inline Helm values for the controller chart.
  # NOTE(review): nesting below was reconstructed from flattened text (migration
  # is assumed to sit under controller) — confirm against the repository's
  # egs-installer-config.yaml.
  inline_values:
    kubeslice:
      controller:
        # Endpoint of the controller API server; auto-fetched if left empty
        endpoint: ""
        migration:
          minio:
            install: "false"  # Do not install MinIO during migration
  helm_flags: "--timeout 10m --atomic"  # Additional Helm flags for the installation
  verify_install: true  # Verify the installation of the controller
  # Timeout for the controller installation verification (in seconds)
  verify_install_timeout: 30
  skip_on_verify_fail: false  # If verification fails, do not skip the step
# Kubeslice UI installation settings
kubeslice_ui_egs:
  skip_installation: false  # Do not skip the installation of the UI
  use_global_kubeconfig: true  # Use global kubeconfig for the UI installation
  namespace: "kubeslice-controller"  # Kubernetes namespace where the UI will be installed
  release: "kubeslice-ui"  # Helm release name for the UI
  chart: "kubeslice-ui-egs"  # Helm chart name for the UI
  helm_flags: "--atomic"  # Additional Helm flags for the UI installation
  verify_install: true  # Verify the installation of the UI
  verify_install_timeout: 50  # Timeout for the UI installation verification (in seconds)
  skip_on_verify_fail: false  # If UI verification fails, do not skip the step
  specific_use_local_charts: true  # Override to use local charts for the UI
# Kubeslice worker installation settings (one list entry per worker cluster)
kubeslice_worker_egs:
- name: "worker-1"
  use_global_kubeconfig: true  # Use global kubeconfig for this worker
  skip_installation: false  # Do not skip the installation of the worker
  specific_use_local_charts: true  # Override to use local charts for this worker
  namespace: "kubeslice-system"  # Kubernetes namespace for this worker
  release: "kubeslice-worker1-release"  # Helm release name for the worker
  chart: "kubeslice-worker-egs"  # Helm chart name for the worker
  # Inline Helm values for the worker chart.
  # NOTE(review): grouping below was reconstructed from flattened text —
  # confirm against the repository's egs-installer-config.yaml.
  inline_values:
    cluster:
      name: worker-1
      # Cluster endpoint accessible from the controller cluster
      endpoint: "<worker-cluster-endpoint>"
    kubesliceNetworking:
      enabled: false  # Disable Kubeslice networking for this worker
    egs:
      # Prometheus endpoint
      prometheusEndpoint: "http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090"
      grafanaDashboardBaseUrl: "http://grafana-test"  # Grafana dashboard base URL
    metrics:
      insecure: true  # Allow insecure connections for metrics
  helm_flags: "--atomic"  # Additional Helm flags for the worker installation
  verify_install: true  # Verify the installation of the worker
  # Timeout for the worker installation verification (in seconds)
  verify_install_timeout: 60
  skip_on_verify_fail: false  # Do not skip if worker verification fails
# Project and cluster registration settings
enable_project_creation: true  # Enable project creation in Kubeslice
enable_cluster_registration: true  # Enable cluster registration in Kubeslice

# Define projects
projects:
- name: "avesha"
  username: "admin"  # Username for accessing the Kubeslice project
# Define cluster registration
cluster_registration:
- cluster_name: "worker-1"
  project_name: "avesha"
  telemetry:
    enabled: true  # Enable telemetry for this cluster
    # Telemetry endpoint
    endpoint: "http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090"
    telemetryProvider: "prometheus"  # Telemetry provider (Prometheus in this case)
  geoLocation:
    cloudProvider: "GCP"  # Cloud provider for this cluster (e.g., GCP)
    cloudRegion: "us-central1"  # Cloud region for this cluster (e.g., us-central1)
# Enable or disable the installation of additional applications
enable_install_additional_apps: true  # Set to true to enable additional apps installation

# Define additional applications to install
additional_apps:
- name: "gpu-operator"
  skip_installation: false  # Do not skip the installation of the GPU operator
  use_global_kubeconfig: true  # Use global kubeconfig for this application
  namespace: "gpu-operator"  # Namespace where the GPU operator will be installed
  release: "gpu-operator-release"  # Helm release name for the GPU operator
  chart: "gpu-operator"  # Helm chart name for the GPU operator
  # Helm repository URL for the GPU operator
  repo_url: "https://helm.ngc.nvidia.com/nvidia"
  version: "v24.6.0"  # Version of the GPU operator to install
  specific_use_local_charts: true  # Use local charts for this application
  inline_values:  # Inline Helm values for the GPU operator chart
    hostPaths:
      driverInstallDir: "/home/kubernetes/bin/nvidia"
    toolkit:
      installDir: "/home/kubernetes/bin/nvidia"
    cdi:
      enabled: true
      default: true
    driver:
      enabled: false
  helm_flags: "--wait"  # Additional Helm flags for this application's installation
  verify_install: true  # Verify the installation of the GPU operator
  verify_install_timeout: 600  # Timeout for verification (in seconds)
  skip_on_verify_fail: false  # Do not skip if verification fails
- name: "prometheus"
  skip_installation: false  # Do not skip the installation of Prometheus
  use_global_kubeconfig: true  # Use global kubeconfig for Prometheus
  namespace: "monitoring"  # Namespace where Prometheus will be installed
  release: "prometheus"  # Helm release name for Prometheus
  chart: "kube-prometheus-stack"  # Helm chart name for Prometheus
  # Helm repository URL for Prometheus
  repo_url: "https://prometheus-community.github.io/helm-charts"
  version: "v45.0.0"  # Version of the Prometheus stack to install
  specific_use_local_charts: true  # Use local charts for this application
  values_file: ""  # Path to an external values file, if any
  inline_values:  # Inline Helm values for Prometheus
    prometheus:
      service:
        type: LoadBalancer
        port: 32270
      prometheusSpec:
        additionalScrapeConfigs:
        - job_name: tgi
          kubernetes_sd_configs:
          - role: endpoints
          relabel_configs:
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod_name
          - source_labels: [__meta_kubernetes_pod_container_name]
            target_label: container_name
          static_configs:
          - targets: ["llm-inference.demo.svc.cluster.local:80"]
        - job_name: gpu-metrics
          scrape_interval: 1s
          metrics_path: /metrics
          scheme: http
          kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
              - gpu-operator
          relabel_configs:
          - source_labels: [__meta_kubernetes_endpoints_name]
            action: drop
            regex: .*-node-feature-discovery-master
          - source_labels: [__meta_kubernetes_pod_node_name]
            action: replace
            target_label: kubernetes_node
    grafana:
      enabled: true
      grafana.ini:
        auth:
          disable_login_form: true
          disable_signout_menu: true
        auth.anonymous:
          enabled: true
          org_role: Viewer
      service:
        type: LoadBalancer
      persistence:
        enabled: true
        size: 1Gi
  helm_flags: "--wait"  # Additional Helm flags for this application's installation
  verify_install: true  # Verify the installation of Prometheus
  verify_install_timeout: 600  # Timeout for verification (in seconds)
  skip_on_verify_fail: false  # Do not skip if verification fails
# Enable custom applications
enable_custom_apps: true  # Set to true to enable custom apps

# Define custom applications and their associated manifests
manifests:
- appname: gpu-operator-quota
  manifest: ""  # URL or path to the manifest file; if empty, inline YAML is used
  overrides_yaml: ""  # Path to an external YAML file with overrides, if any
  # Inline YAML content for this custom application. The body of the literal
  # block scalar must stay indented deeper than the inline_yaml key.
  inline_yaml: |
    apiVersion: v1
    kind: ResourceQuota
    metadata:
      name: gpu-operator-quota
    spec:
      hard:
        pods: 100
      scopeSelector:
        matchExpressions:
        - operator: In
          scopeName: PriorityClass
          values:
          - system-node-critical
          - system-cluster-critical
  use_global_kubeconfig: true  # Use global kubeconfig for this application
  skip_installation: false  # Do not skip the installation of this application
  verify_install: true  # Verify the installation of this application
  verify_install_timeout: 30  # Timeout for verification (in seconds)
  skip_on_verify_fail: false  # Do not skip if verification fails
  namespace: gpu-operator  # Namespace for this application
  kubeconfig: ""  # Path to the kubeconfig file specific to this application
  # Kubecontext specific to this application; uses global context if empty
  kubecontext: ""
- appname: nvidia-driver-installer
  # Manifest is fetched from this URL; no inline YAML is used for this entry
  manifest: "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml"
  overrides_yaml: ""  # Path to an external YAML file with overrides, if any
  inline_yaml: null  # Inline YAML content for this application
  use_global_kubeconfig: true  # Use global kubeconfig for this application
  skip_installation: false  # Do not skip the installation of this application
  verify_install: true  # Verify the installation of this application
  verify_install_timeout: 200  # Timeout for verification (in seconds)
  skip_on_verify_fail: true  # Skip if verification fails
  namespace: kube-system  # Namespace for this application
# Command execution settings
run_commands: true  # Enable the execution of commands defined in the YAML

# Define commands to execute
commands:
- use_global_kubeconfig: true  # Use global kubeconfig for these commands
  skip_installation: false  # Do not skip the execution of these commands
  verify_install: true  # Verify the execution of these commands
  verify_install_timeout: 200  # Timeout for verification (in seconds)
  skip_on_verify_fail: true  # Skip if command verification fails
  namespace: kube-system  # Namespace context for these commands
  # Commands to execute, one per line. The second command labels every node
  # that reports nvidia.com/gpu capacity with
  # gke-no-default-nvidia-gpu-device-plugin=true.
  command_stream: |
    kubectl get nodes
    kubectl get nodes -o json | jq -r '.items[] | select(.status.capacity["nvidia.com/gpu"] != null) | .metadata.name' | xargs -I {} kubectl label nodes {} gke-no-default-nvidia-gpu-device-plugin=true --overwrite