Version: 1.1.0

Explore the Helm Chart

Let us explore the Helm chart's values.yaml file, which ships with the installation package.

warning

Do not copy the following YAML configuration directly. Hash characters (#) used in comments may not be interpreted correctly when copied from this page. Always refer to the actual values.yaml file in the repository for accurate configuration.
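
Before editing anything, it can help to dump the chart's default values and do a dry-run render against your overrides. A minimal sketch, assuming the chart directory is ./obliq-sre-agent/ as in the install examples below and that your overrides live in my-overrides.yaml:

helm show values ./obliq-sre-agent/ > default-values.yaml
helm install obliq-sre-agent ./obliq-sre-agent/ -f my-overrides.yaml -n avesha --dry-run

The dry run renders the templates without installing anything, so malformed values surface before they reach the cluster.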

# ========================================
# OBLIQ SRE AGENT - HELM VALUES CONFIGURATION
# ========================================
#
# Full comprehensive values for obliq-sre-agent umbrella chart
# Exhaustive configuration for production deployments with all options
#
# 📖 USAGE GUIDE:
# 1. REQUIRED: Set global.env.openai.OPENAI_API_KEY for core AI services
# 2. REQUIRED: Provide kubeconfig via --set-file global.kubeconfig.content=./kubeconfig
# 3. REQUIRED: Create registry secret for image pulling (contact support@aveshasystems.com)
# 4. OPTIONAL: Enable additional services by setting <service>.enabled=true
# 5. OPTIONAL: Configure external integrations (AWS, DataDog, Slack, Jira) by setting credentials
#
# 🔧 COMMON INSTALLATION PATTERNS:
# - Minimal: Only core services with OpenAI (default configuration)
# - AWS Integration: Enable aws-mcp, cloudwatch-mcp with AWS credentials
# - Full Integration: Enable all services with all external credentials
#
# 💡 PARAMETER OVERRIDE EXAMPLES:
#   helm install obliq-sre-agent ./obliq-sre-agent/ \
#     --set-file global.kubeconfig.content=./kubeconfig \
#     --set global.env.openai.OPENAI_API_KEY="sk-..." \
#     --set aws-mcp.enabled=true \
#     --set global.env.aws.AWS_ACCESS_KEY_ID="..." \
#     --set global.env.aws.AWS_SECRET_ACCESS_KEY="..."

# Centralized configuration
x-imageRegistry: &imageRegistry "avesha.azurecr.io"

# Centralized image paths for all services
x-images:
  backend: &backendImage "agents/release/backend"
  avesha-unified-ui: &aveshaUnifiedUiImage "agents/release/obliq-ai-sre"
  orchestrator: &orchestratorImage "agents/release/orchestrator"
  rca-agent: &rcaAgentImage "agents/release/rca-agent"
  anomaly-detection: &anomalyDetectionImage "agents/release/anomaly-detection"
  auto-remediation: &autoRemediationImage "agents/release/auto-remediation"
  incident-manager: &incidentManagerImage "agents/release/incident-manager"
  service-graph-engine: &serviceGraphEngineImage "agents/release/service-graph-engine"
  active-inventory: &activeInventoryImage "agents/release/active-inventory"
  infra-agent: &infraAgentImage "agents/release/infra-agent"
  aws-mcp: &awsMcpImage "agents/release/aws-mcp"
  k8s-mcp: &k8sMcpImage "agents/release/k8s-mcp"
  prometheus-mcp: &prometheusMcpImage "agents/release/prometheus-mcp"
  loki-mcp: &lokiMcpImage "agents/release/loki-mcp"
  neo4j-mcp: &neo4jMcpImage "agents/release/neo4j-mcp"
  cloudwatch-mcp: &cloudwatchMcpImage "agents/release/cloudwatch-mcp"
  kubernetes-events-ingester: &kubernetesEventsIngesterImage "agents/release/kubernetes-events-ingester"
  slack-ingester: &slackIngesterImage "agents/release/slack-ingester"
  aws-ec2-cloudwatch-alarms: &awsEc2CloudwatchAlarmsImage "agents/release/aws-ec2-cloudwatch-alarms"

# Common service configuration templates
x-commonConfig: &commonServiceConfig
  replicaCount: 1
  resources:
    limits:
      cpu: 1000m
      memory: 1Gi
    requests:
      cpu: 500m
      memory: 512Mi
  autoscaling:
    enabled: false
    minReplicas: 1
    maxReplicas: 3
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80
  volumes:
    persistent:
      enabled: false
      storageClass: ""
      size: 20Gi
      mountPath: /data
      accessMode: ReadWriteOnce
  livenessProbe:
    enabled: true
    path: /health
    initialDelaySeconds: 60
    periodSeconds: 30
    timeoutSeconds: 10
    failureThreshold: 3
    successThreshold: 1
  readinessProbe:
    enabled: true
    path: /ready
    initialDelaySeconds: 10
    periodSeconds: 10
    timeoutSeconds: 5
    failureThreshold: 3
    successThreshold: 1
  secrets:
    database:
      DB_PASSWORD: "admin123"
    api:
      API_KEY: "admin123"
      JWT_SECRET: "admin123"
    # external:
    #   # DEPRECATED: Use global.env.aws.AWS_SECRET_ACCESS_KEY instead to avoid conflicts
    #   AWS_SECRET_ACCESS_KEY: "admin123"
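
# Services inherit these defaults via a YAML merge key and can then override
# individual fields, e.g. (illustrative only; see the per-service sections below):
#   backend:
#     <<: *commonServiceConfig
#     replicaCount: 2 # overrides the default of 1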

# Common ingress annotations
x-commonIngressAnnotations: &commonIngressAnnotations
  kubernetes.io/ingress.class: nginx
  nginx.ingress.kubernetes.io/ssl-redirect: "true"
  nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
  nginx.ingress.kubernetes.io/proxy-body-size: "50m"
  nginx.ingress.kubernetes.io/proxy-connect-timeout: "60"
  nginx.ingress.kubernetes.io/proxy-send-timeout: "60"
  nginx.ingress.kubernetes.io/proxy-read-timeout: "60"
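
# These annotations are merged into each service's ingress via
# <<: *commonIngressAnnotations; per-service annotations can sit alongside the
# merge key and take precedence, e.g. (illustrative only):
#   annotations:
#     <<: *commonIngressAnnotations
#     nginx.ingress.kubernetes.io/proxy-body-size: "100m" # example override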

# ========================================
# GLOBAL CONFIGURATION
# ========================================
# Global configuration for all sub-charts in the Obliq SRE Agent platform
global:
  # ----------------------------------------
  # Image Registry Configuration
  # ----------------------------------------
  # Global image registry disabled to prevent third-party charts from using it
  # In-house services use template defaults ({{ .Values.global.imageRegistry | default "avesha.azurecr.io" }})
  # Set to null to force explicit registry configuration per service
  imageRegistry: null

  # ----------------------------------------
  # Kubernetes Configuration
  # ----------------------------------------
  # Global kubeconfig configuration for Kubernetes API access
  # Required for the k8s-mcp service and other Kubernetes integrations
  kubeconfig:
    # Kubeconfig file content (provide via --set-file global.kubeconfig.content=./kubeconfig)
    # Example: helm install --set-file global.kubeconfig.content=./kubeconfig
    content: ""

  # ----------------------------------------
  # Container Registry Authentication
  # ----------------------------------------
  # Global image pull secrets configuration
  # Required for pulling images from the private Avesha registry (avesha.azurecr.io)
  imagePullSecrets:
    # Use an existing secret by name (must be created beforehand):
    # kubectl create secret docker-registry registry --docker-server=avesha.azurecr.io --docker-username=... --docker-password=...
    - name: registry

  # Image pull secret creation configuration (alternative to manual creation)
  imagePullSecretConfig:
    # Enable automatic creation of the image pull secret (requires credentials)
    create:
      enabled: false # Set to true if you want the chart to create the secret
      name: "registry"
      # Docker registry credentials (contact support@aveshasystems.com for access)
      registry: "avesha.azurecr.io"
      username: "" # Provide via --set or values override
      password: "" # Provide via --set or values override
    # Use an existing pre-created image pull secret (if provided)
    # When existing.enabled is true, the chart will use the existing secret
    # instead of creating a new one or using the default global imagePullSecrets
    existing:
      enabled: false
      name: ""

  # ----------------------------------------
  # Global Secret Management
  # ----------------------------------------
  # Global secret configuration for storing sensitive environment variables
  globalSecret:
    # Enable creation of a new global secret containing all environment variables
    create:
      enabled: true # Recommended: let the chart manage the global secret
      name: "obliq-sre-agent-global-secret"
    # Use an existing pre-created secret (advanced use case)
    # When existing.enabled is true, the chart will use the existing secret
    # instead of creating a new one. Make sure the existing secret contains
    # all required environment variables defined in the global.env sections below.
    existing:
      enabled: false
      name: ""

  # ----------------------------------------
  # Infrastructure Configuration
  # ----------------------------------------
  # Global storage class for persistent volumes (leave empty for cluster default)
  storageClass: ""

  # Global namespace where all services will be deployed
  namespace: "avesha"

  # ========================================
  # ENVIRONMENT VARIABLES
  # ========================================
  # Global environment variables shared across all services
  # These are automatically injected into all service containers
  env:
    # ----------------------------------------
    # Common Runtime Configuration
    # ----------------------------------------
    common:
      # Application environment settings
      NODE_ENV: "production" # Node.js environment mode
      LOG_LEVEL: "INFO" # Global logging level (DEBUG, INFO, WARN, ERROR)
      LOGURU_LEVEL: "INFO" # Python Loguru logging level
      TZ: "UTC" # Timezone for all services
      ENVIRONMENT: "production" # Deployment environment identifier
      CLUSTER_NAME: "obliq-cluster" # Kubernetes cluster identifier
      # NAMESPACE is now dynamically generated by helper functions

      # Automation and execution settings
      AUTOMATIC_EXECUTION_ENABLED: "true" # Enable automated remediation actions
      KUBECONFIG: "/etc/kubeconfig/config" # Path to mounted kubeconfig file
      DEBUG: "false" # Enable debug mode across services

      # SSL/TLS configuration for internal service communication
      # Disabled for internal cluster communication (services behind ClusterIP)
      SSL_VERIFY: "false" # Disable SSL verification for internal calls
      TLS_VERIFY: "false" # Disable TLS verification for internal calls
      DISABLE_SSL_VERIFICATION: "true" # Global SSL verification disable
      # PORT is service-specific and defined in each service's env.app section

    # ----------------------------------------
    # Database Configuration
    # ----------------------------------------
    # Internal database credentials for Neo4j and MongoDB
    # These databases are deployed as part of the Obliq platform
    database:
      # Neo4j graph database (stores relationships and topology data)
      NEO4J_USER: "neo4j" # Neo4j username
      NEO4J_PASSWORD: "admin123" # Neo4j password (change for production)
      NEO4J_AUTH: "neo4j/admin123" # Neo4j auth string format
      NEO4J_DATABASE: "neo4j" # Neo4j database name
      # Neo4j URLs are dynamically generated by helper functions

      # MongoDB document database (stores metrics and infrastructure data)
      MONGO_ROOT_USERNAME: "admin" # MongoDB root username
      MONGO_ROOT_PASSWORD: "admin123" # MongoDB root password (change for production)
      MONGODB_DATABASE: "infra_db" # MongoDB database name for infrastructure data
      MONGODB_USERNAME: "admin" # MongoDB application username
      MONGODB_PASSWORD: "admin123" # MongoDB application password (change for production)

    # ----------------------------------------
    # AWS Configuration
    # ----------------------------------------
    # AWS credentials and configuration for cloud integrations
    # Required for: aws-mcp, cloudwatch-mcp, aws-ec2-cloudwatch-alarms services
    aws:
      # IAM role ARNs for different AWS integrations
      AWS_ROLE_ARN_AWS_MCP: "" # IAM role for the aws-mcp service (EC2, S3, etc.)
      AWS_ROLE_ARN_EC2_CLOUDWATCH_ALARMS: "" # IAM role for the CloudWatch alarms service

      # AWS API credentials (alternative to IAM roles)
      AWS_ACCESS_KEY_ID: "" # AWS access key ID (provide via --set)
      AWS_SECRET_ACCESS_KEY: "" # AWS secret access key (provide via --set)
      AWS_REGION: "us-east-1" # Default AWS region

      # AWS MCP service credentials (internal authentication)
      AWS_MCP_USERNAME: "admin" # Username for the AWS MCP service
      AWS_MCP_PASSWORD: "admin123" # Password for the AWS MCP service

      # EKS service account token (for EKS IRSA authentication)
      AWS_WEB_IDENTITY_TOKEN_FILE: "/var/run/secrets/eks.amazonaws.com/serviceaccount/token"

      # Metrics collection settings
      METRICS_SAMPLING_INTERVAL_SECONDS: "300" # Metrics collection interval (5 minutes)

    # ----------------------------------------
    # AI/ML Service Configuration
    # ----------------------------------------
    # OpenAI configuration (REQUIRED for core AI services)
    # Used by: rca-agent, anomaly-detection, auto-remediation
    openai:
      OPENAI_API_KEY: "" # OpenAI API key (provide via --set)

    # ----------------------------------------
    # Observability Integrations
    # ----------------------------------------
    # Prometheus configuration (for the prometheus-mcp service)
    prometheus:
      PROMETHEUS_URL: "http://prometheus:9090" # External Prometheus server URL
      PROMETHEUS_USER: "admin" # Prometheus username (if auth enabled)
      PROMETHEUS_PASSWORD: "admin123" # Prometheus password (if auth enabled)
      PROMETHEUS_MCP_USERNAME: "admin" # Username for the Prometheus MCP service
      PROMETHEUS_MCP_PASSWORD: "admin123" # Password for the Prometheus MCP service

    # Loki configuration (for the loki-mcp service)
    loki:
      LOKI_URL: "http://loki:3100" # External Loki server URL
      LOKI_TOKEN: "" # Loki authentication token (if required)
      LOKI_USERNAME: "" # Loki username (if auth enabled)
      LOKI_PASSWORD: "" # Loki password (if auth enabled)
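
    # Example: enable the Prometheus and Loki MCP servers against your own
    # endpoints (the URLs below are placeholders):
    #   --set prometheus-mcp.enabled=true \
    #   --set global.env.prometheus.PROMETHEUS_URL="http://prometheus.monitoring:9090" \
    #   --set loki-mcp.enabled=true \
    #   --set global.env.loki.LOKI_URL="http://loki.monitoring:3100"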

    # ----------------------------------------
    # External Service Integrations
    # ----------------------------------------
    # Jira configuration (OPTIONAL - for incident management integration)
    jira:
      JIRA_EMAIL: "" # Jira user email (provide via --set)
      JIRA_API_TOKEN: "" # Jira API token (provide via --set)
      JIRA_BASE_URL: "https://avesha.atlassian.net" # Jira instance URL
      JIRA_PROJECT_KEY: "" # Jira project key for creating tickets
      JIRA_PAT: "" # Jira Personal Access Token (alternative to API token)

    # Slack configuration (OPTIONAL - for notifications and slack-ingester)
    slack:
      SLACK_WEBHOOK_URL: "" # Slack webhook URL for notifications
      SLACK_BOT_TOKEN: "" # Slack bot token (xoxb-...) for slack-ingester
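
    # Example: enable Slack ingestion (the bot token below is a placeholder):
    #   --set slack-ingester.enabled=true \
    #   --set global.env.slack.SLACK_BOT_TOKEN="xoxb-..."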


    # Note: Service URLs are dynamically generated by helper functions
    # and passed via the globalSecretEnv template for internal communication

    # ----------------------------------------
    # MCP (Model Context Protocol) Configuration
    # ----------------------------------------
    # MCP servers provide AI model context for different data sources
    # These define how AI services connect to various infrastructure components
    mcp:
      # Individual MCP server endpoints
      MCP_SERVERS_AWS_EC2: "AWS_EC2:http://aws-mcp:8080" # AWS EC2 integration
      MCP_SERVERS_K8S: "K8s:http://k8s-mcp:8080" # Kubernetes integration
      MCP_SERVERS_PROMETHEUS: "PROMETHEUS:http://prometheus-mcp:8041" # Prometheus metrics
      MCP_SERVERS_NEO4J: "NEO4J:http://neo4j-mcp:8080" # Neo4j graph data
      MCP_SERVERS_LOKI: "LOKI:http://loki-mcp:8089" # Loki logs

      # Combined MCP server configurations
      MCP_SERVERS_FULL: "AWS_EC2:http://aws-mcp:8080,K8s:http://k8s-mcp:8080,PROMETHEUS:http://prometheus-mcp:8041,NEO4J:http://neo4j-mcp:8080,LOKI:http://loki-mcp:8089"
      MCP_SERVERS: "AWS_EC2:http://aws-mcp:8080,K8s:http://k8s-mcp:8080,PROMETHEUS:http://prometheus-mcp:8041,NEO4J:http://neo4j-mcp:8080,LOKI:http://loki-mcp:8089"

      # Neo4j MCP specific configuration
      NEO4J_MCP_URL: "http://neo4j-mcp:8080" # Neo4j MCP service URL
      NEO4J_MCP_USERNAME: "admin" # Neo4j MCP username
      NEO4J_MCP_PASSWORD: "admin123" # Neo4j MCP password

    # ----------------------------------------
    # DataDog Integration (OPTIONAL)
    # ----------------------------------------
    # Service Graph Engine configuration (for the service-graph-engine service)
    # Used to pull service topology from DataDog APM
    sg:
      APM_PROVIDER: "datadog" # APM provider (currently only DataDog is supported)
      UPDATE_INTERVAL_SECONDS: "86400" # Service graph update interval (24 hours)
      SG_APM_PROVIDER: "datadog" # SG-prefixed APM provider (for compatibility)
      SG_UPDATE_INTERVAL_SECONDS: "120" # SG-prefixed update interval in seconds
      DD_API_KEY: "" # DataDog API key (provide via --set)
      DD_APP_KEY: "" # DataDog application key (provide via --set)
      DD_SITE: "us5.datadoghq.com" # DataDog site (us1, us3, us5, eu1, etc.)
      DD_ENVIRONMENTS: "production" # DataDog environments to monitor
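
    # Example: enable the DataDog-backed service graph (both keys are placeholders):
    #   --set service-graph-engine.enabled=true \
    #   --set global.env.sg.DD_API_KEY="..." \
    #   --set global.env.sg.DD_APP_KEY="..."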

    # ----------------------------------------
    # Observability/Telemetry Configuration
    # ----------------------------------------
    # OpenTelemetry configuration for metrics and tracing
    observability:
      OTEL_COLLECTOR_ENDPOINT: "opentelemetry-collector:4317" # OTLP gRPC endpoint
      OTEL_COLLECTOR_HTTP_ENDPOINT: "opentelemetry-collector:4318" # OTLP HTTP endpoint
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://opentelemetry-collector:4317" # Full OTLP URL
      OTEL_EXPORTER_OTLP_INSECURE: "true" # Allow insecure connections

    # ----------------------------------------
    # Kubernetes Integration
    # ----------------------------------------
    kubernetes:
      kubeconfigFile: "files/kubeconfig" # Path to kubeconfig file (for reference)





# ========================================
# SERVICES CONFIGURATION
# ========================================
# Configuration for all Obliq SRE Agent services
#
#
# 🔧 Service Categories:
# - Core Services: Always enabled (databases, backend, ui, orchestrator)
# - AI Services: Enable based on use case (rca-agent, anomaly-detection, auto-remediation)
# - MCP Services: Enable based on integrations needed (aws-mcp, k8s-mcp, prometheus-mcp, etc.)
# - Optional Services: Enable based on external integrations (slack-ingester, service-graph-engine)

# ----------------------------------------
# Database Services
# ----------------------------------------
# Graph Database - stores relationships and topology data for AI/SRE services
neo4j:
  enabled: true # CORE SERVICE - Always required (no external credentials needed)
  fullnameOverride: "neo4j"
  neo4j:
    name: "neo4j"
    # Password must match the value in global.env.database.NEO4J_PASSWORD
    password: "admin123"

  # Service configuration - SECURITY: Use ClusterIP instead of LoadBalancer
  # Neo4j should NEVER be exposed externally for security reasons
  # All access should go through the Obliq SRE Agent backend services
  services:
    neo4j:
      enabled: true
      spec:
        type: ClusterIP # SECURITY: Internal access only, not exposed externally
    admin:
      enabled: true
      spec:
        type: ClusterIP # SECURITY: Admin interface internal only

  # Volume configuration - required by the Neo4j chart
  volumes:
    data:
      mode: "defaultStorageClass"
      defaultStorageClass:
        requests:
          storage: 10Gi

# Document Database - stores application data and configurations
mongodb:
  enabled: true # CORE SERVICE - Always required (no external credentials needed)
  fullnameOverride: "mongodb"

  # Authentication values must match the global.env.database configuration
  auth:
    enabled: true
    rootPassword: "admin123"
    username: "admin"
    database: "infra_db"

  # MongoDB deployment configuration - StatefulSet for persistence
  architecture: standalone
  useStatefulSet: true

  # Persistence with PVC retention
  persistence:
    enabled: true
    size: 10Gi
    resourcePolicy: "keep"
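    # "keep" retains the PVC when the release is uninstalled; delete the PVC
    # manually to reclaim storage if you remove the platform permanently.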

# Observability Data Collection - collects and processes telemetry data
opentelemetry-collector:
  enabled: true # CORE SERVICE - Always required (no external credentials needed)
  fullnameOverride: "opentelemetry-collector"

  # Completely override the global section to prevent in-house registry usage
  global:
    imageRegistry: null # Disable global registry override

  # OpenTelemetry Collector service configuration for production environments

  # Image configuration - uses the official OpenTelemetry image (not affected by the global registry)
  image:
    repository: "otel/opentelemetry-collector"
    tag: "0.131.0"
    pullPolicy: IfNotPresent
    # Note: This uses the official OpenTelemetry image directly from Docker Hub
    # and will NOT be overridden by the global imageRegistry setting

  # Collector configuration
  config:
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318

    processors:
      batch:
        timeout: 5s
        send_batch_size: 10
      memory_limiter:
        check_interval: 1s
        limit_percentage: 80
        spike_limit_percentage: 25

    exporters:
      debug:
        verbosity: detailed
      otlp:
        endpoint: "http://localhost:4317"
        tls:
          insecure: true
      otlphttp:
        endpoint: "http://active-inventory:8065/api/"
        compression: gzip # Recommended for async
        timeout: 60s # Give async processing time
        encoding: json
        tls:
          insecure: true

    service:
      pipelines:
        traces:
          receivers: [otlp]
          processors: [batch]
          exporters: [debug, otlp]
        metrics:
          receivers: [otlp]
          processors: [batch]
          exporters: [debug, otlp]
        logs:
          receivers: [otlp]
          processors: [batch]
          exporters: [debug, otlphttp]

  # Service configuration
  service:
    type: ClusterIP

  # Deployment configuration
  mode: "deployment"

  # Resource limits
  resources:
    limits:
      cpu: 500m
      memory: 1Gi
    requests:
      cpu: 20m
      memory: 100Mi

  # Security context (container-level)
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    # Note: fsGroup belongs in podSecurityContext, not securityContext

  # Pod security context
  podSecurityContext:
    runAsNonRoot: true
    runAsUser: 1000
    fsGroup: 1000

  # Service account
  serviceAccount:
    create: true
    name: "opentelemetry-collector"

  # Network policy
  networkPolicy:
    enabled: false

  # Pod disruption budget
  podDisruptionBudget:
    enabled: false

  # Horizontal pod autoscaler
  autoscaling:
    enabled: false
    minReplicas: 1
    maxReplicas: 1
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# Core Application Services
# Main API Server - provides REST API and core backend functionality
backend:
  enabled: true # CORE SERVICE - Always required (requires OpenAI API key)
  fullnameOverride: "backend"
  image:
    repository: *backendImage
    tag: "1.1.1"
    pullPolicy: IfNotPresent

  # Use the common service configuration
  <<: *commonServiceConfig

  # Override common config with service-specific values
  livenessProbe:
    enabled: false # Disabled: the app doesn't expose a /health endpoint
    path: /health
    port: 8000
    initialDelaySeconds: 60
    periodSeconds: 30
    timeoutSeconds: 10
    failureThreshold: 3
    successThreshold: 1

  readinessProbe:
    enabled: false # Disabled: the app doesn't expose a /ready endpoint
    path: /ready
    port: 8000
    initialDelaySeconds: 10
    periodSeconds: 10
    timeoutSeconds: 5
    failureThreshold: 3
    successThreshold: 1

  service:
    port: 8000

  ingress:
    enabled: true
    className: "nginx"
    annotations:
      <<: *commonIngressAnnotations
    hosts:
      - host: api.avesha.local
        paths:
          - path: /
            pathType: Prefix
    tls:
      - secretName: api-tls
        hosts:
          - api.avesha.local

  env:
    app:
      AGENT_TYPE: "backend"
      PORT: "8000"
      INFRA_AGENT_HOST: infra-agent
      INFRA_AGENT_PORT: 8051
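
  # Example: point the API ingress at your own domain (api.avesha.local above
  # is a placeholder):
  #   --set backend.ingress.hosts[0].host=api.example.com \
  #   --set backend.ingress.tls[0].hosts[0]=api.example.com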

# Web User Interface - provides frontend dashboard and user interface
avesha-unified-ui:
  enabled: true # CORE SERVICE - Always required (no external credentials needed)
  fullnameOverride: "avesha-unified-ui"
  image:
    repository: *aveshaUnifiedUiImage
    tag: "1.1.1"
    pullPolicy: IfNotPresent

  # Use the common service configuration
  <<: *commonServiceConfig

  # Override common config with service-specific values
  livenessProbe:
    enabled: true
    path: /health
    port: 80
    initialDelaySeconds: 60
    periodSeconds: 30
    timeoutSeconds: 10
    failureThreshold: 3
    successThreshold: 1

  readinessProbe:
    enabled: true
    path: /ready
    port: 80
    initialDelaySeconds: 10
    periodSeconds: 10
    timeoutSeconds: 5
    failureThreshold: 3
    successThreshold: 1

  service:
    port: 80

  ingress:
    enabled: true
    className: "nginx"
    annotations:
      <<: *commonIngressAnnotations
    hosts:
      - host: ui.avesha.local
        paths:
          - path: /
            pathType: Prefix
    tls:
      - secretName: ui-tls
        hosts:
          - ui.avesha.local

  env:
    app:
      AGENT_TYPE: "avesha_unified_ui"
      PORT: "3000"

# Workflow Orchestration Engine - manages and coordinates AI/SRE workflows
orchestrator:
  enabled: true # CORE SERVICE - Always required (requires OpenAI API key)
  fullnameOverride: "orchestrator"
  image:
    repository: *orchestratorImage
    tag: "1.1.1"
    pullPolicy: IfNotPresent

  # Use the common service configuration
  <<: *commonServiceConfig

  # Override common config with service-specific values
  livenessProbe:
    enabled: false # Disabled in favor of the individual chart configuration

  readinessProbe:
    enabled: false # Configured in the individual chart with the correct port

  service:
    port: 8060 # Fixed to match the actual service port

  env:
    app:
      AGENT_TYPE: "orchestrator"
      PORT: "8060" # Service-specific port override

  # Note: OPENAI_API_KEY and service URLs are provided via globalSecretEnv
  # Note: Common environment variables (NODE_ENV, LOG_LEVEL, TZ) are provided via globalSecretEnv

# AI/ML Services
# Root Cause Analysis Engine - AI-powered analysis and recommendations
rca-agent:
  enabled: true # CORE SERVICE - Always required (requires OpenAI API key)
  fullnameOverride: "rca-agent"
  image:
    repository: *rcaAgentImage
    tag: "1.1.1"
  service:
    port: 8062
  env:
    app:
      AGENT_TYPE: "rca_agent"
      PORT: "8062"
      # Note: OPENAI_API_KEY, MCP_SERVERS, and service URLs are provided via globalSecretEnv


# Anomaly Detection Engine - AI-powered anomaly detection and alerting
anomaly-detection:
  enabled: true # CORE SERVICE - Always required (requires OpenAI API key)
  fullnameOverride: "anomaly-detection"
  image:
    repository: *anomalyDetectionImage
    tag: "1.1.1"
  service:
    port: 8061
  env:
    app:
      AGENT_TYPE: "anomaly_detection"
      PORT: "8061"


# Automated Remediation Engine - AI-powered automated fixes and responses
auto-remediation:
  enabled: true # CORE SERVICE - Always required (requires OpenAI API key)
  fullnameOverride: "auto-remediation"
  image:
    repository: *autoRemediationImage
    tag: "1.1.1"
  service:
    port: 8063
  env:
    app:
      AGENT_TYPE: "auto_remediation"
      PORT: "8063"

# Incident Management System - manages and tracks incidents and responses
incident-manager:
  enabled: true # CORE SERVICE - Always required (requires OpenAI API key)
  fullnameOverride: "incident-manager"
  image:
    repository: *incidentManagerImage
    tag: "1.1.1"
  service:
    port: 8064
  env:
    app:
      AGENT_TYPE: "incident_manager"
      PORT: "8064"


# Service Topology Mapping - DataDog integration for service graphs and monitoring
service-graph-engine:
  enabled: false # OPTIONAL SERVICE - Enable for DataDog integration (requires DD_API_KEY, DD_APP_KEY)
  fullnameOverride: "service-graph-engine"
  image:
    repository: *serviceGraphEngineImage
    tag: "1.1.1"
  service:
    port: 8074
  env:
    app:
      AGENT_TYPE: "service_graph_engine"
      PORT: "8074"
      # Note: APM_PROVIDER, UPDATE_INTERVAL_SECONDS, and DD_* variables are provided via globalSecretEnv
      # Note: LOG_LEVEL is provided via globalSecretEnv


# Infrastructure Services
# Infrastructure Inventory - discovers and tracks infrastructure components
active-inventory:
  enabled: true # CORE SERVICE - Always required (no external credentials needed)
  fullnameOverride: "active-inventory"
  image:
    repository: *activeInventoryImage
    tag: "1.1.1"
  service:
    port: 8065
  env:
    app:
      AGENT_TYPE: "active_inventory"
      PORT: "8065"


# Infrastructure Monitoring Agent - monitors infrastructure health and metrics
infra-agent:
  enabled: true # CORE SERVICE - Always required (no external credentials needed)
  fullnameOverride: "infra-agent"
  image:
    repository: *infraAgentImage
    tag: "1.1.1"
  service:
    port: 8051
  ingress:
    enabled: false
  env:
    app:
      AGENT_TYPE: "infra_agent"
      PORT: "8051"


# MCP Integration Services (Model Context Protocol)
# AWS Model Context Protocol Server - provides AWS EC2/CloudWatch integration
aws-mcp:
  enabled: false # OPTIONAL MCP - Enable for AWS integration (requires AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ROLE_ARN_AWS_MCP)
  fullnameOverride: "aws-mcp"
  image:
    repository: *awsMcpImage
    tag: "1.1.1"
  service:
    port: 8080
  env:
    app:
      AGENT_TYPE: "aws_mcp"
      PORT: "8080"


# Kubernetes Model Context Protocol Server - provides Kubernetes cluster integration
k8s-mcp:
  enabled: true # CORE MCP SERVICE - Always required (requires kubeconfig file)
  fullnameOverride: "k8s-mcp"
  image:
    repository: *k8sMcpImage
    tag: "1.1.1"
  service:
    port: 8080
  # Kubeconfig configuration (inherits from global.kubeconfig.content)
  env:
    app:
      AGENT_TYPE: "k8s_mcp"
      PORT: "8080"
      # Note: KUBECONFIG is provided via globalSecretEnv


# Prometheus Model Context Protocol Server - provides Prometheus metrics integration
prometheus-mcp:
  enabled: false # OPTIONAL MCP - Enable for Prometheus integration (requires PROMETHEUS_URL, PROMETHEUS_MCP_USERNAME, PROMETHEUS_MCP_PASSWORD)
  fullnameOverride: "prometheus-mcp"
  image:
    repository: *prometheusMcpImage
    tag: "1.1.1"
  service:
    port: 8041
  env:
    app:
      AGENT_TYPE: "prometheus_mcp"
      PORT: "8041"
      # Note: PROMETHEUS_MCP_USERNAME and PROMETHEUS_MCP_PASSWORD are provided via globalSecretEnv


# Neo4j Model Context Protocol Server - provides graph database integration
neo4j-mcp:
  enabled: true # CORE MCP SERVICE - Enabled by default (connects to the internal neo4j; optional NEO4J_MCP_USERNAME, NEO4J_MCP_PASSWORD)
  fullnameOverride: "neo4j-mcp"
  image:
    repository: *neo4jMcpImage
    tag: "1.1.1"
  service:
    port: 8080
  env:
    app:
      AGENT_TYPE: "neo4j_mcp"
      PORT: "8080"
      MCP_SERVER_PORT: "8080"
      # Note: LOG_LEVEL is provided via globalSecretEnv


# Loki Model Context Protocol Server - provides log aggregation integration
loki-mcp:
  enabled: false # OPTIONAL MCP - Enable for Loki integration (requires LOKI_URL; optional LOKI_USERNAME, LOKI_PASSWORD, LOKI_TOKEN)
  fullnameOverride: "loki-mcp"
  image:
    repository: *lokiMcpImage
    tag: "1.1.1"
  service:
    port: 8089
  env:
    app:
      AGENT_TYPE: "loki_mcp"
      PORT: "8089"
      LOKI_ORG_ID: "${LOKI_ORG_ID}"
      LOKI_TENANT_ID: "default"
      QUERY_TIMEOUT: "30s"
      MAX_QUERY_LENGTH: "10000"
      ENABLE_LOGS_TAILING: "true"
      MCP_SERVER_PORT: "8089"
      MCP_SERVER_HOST: "0.0.0.0"
      MCP_LOG_LEVEL: "INFO"
      MCP_ENABLE_METRICS: "true"
      # Note: LOG_LEVEL is provided via globalSecretEnv


# CloudWatch Model Context Protocol Server - provides AWS CloudWatch integration
cloudwatch-mcp:
  enabled: false # OPTIONAL MCP - Enable for AWS CloudWatch integration (requires AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, CloudWatch permissions)
  fullnameOverride: "cloudwatch-mcp"
  image:
    repository: *cloudwatchMcpImage
    tag: "1.1.1"
  service:
    port: 8000
  env:
    app:
      AGENT_TYPE: "cloudwatch_mcp"
      SERVER_TRANSPORT: "sse"
      HOST: "0.0.0.0"
      SERVER_PORT: "8000"
      FASTMCP_LOG_LEVEL: "DEBUG"


# Data Ingestion Services
# Kubernetes Events Collection - ingests and processes Kubernetes cluster events
kubernetes-events-ingester:
  enabled: true # OPTIONAL SERVICE - Enable for K8s event monitoring (requires kubeconfig file)
  fullnameOverride: "kubernetes-events-ingester"
  image:
    repository: *kubernetesEventsIngesterImage
    tag: "1.1.1"
  service:
    port: 8065
  # Kubeconfig configuration (uses the global kubeconfig secret)
  kubeconfig:
    secretRef:
      enabled: true
      name: "kubeconfig-secret"
      key: "config"
  env:
    app:
      AGENT_TYPE: "kubernetes_events_ingester"
      PORT: "8065"
      EVENTS_NAMESPACE: "default"
      EVENTS_FILTER: "Warning,Error"
      INGESTION_INTERVAL: "30"
      OTEL_SERVICE_NAME: "kubernetes-events-ingester"
      # Note: LOG_LEVEL and KUBECONFIG are provided via globalSecretEnv


# Slack Message Ingestion - ingests and processes Slack messages and notifications
slack-ingester:
  enabled: false # OPTIONAL SERVICE - Enable for Slack integration (requires SLACK_BOT_TOKEN; optional SLACK_WEBHOOK_URL)
  fullnameOverride: "slack-ingester"
  image:
    repository: *slackIngesterImage
    tag: "1.1.1"
  service:
    port: 8065
  env:
    app:
      AGENT_TYPE: "slack_ingester"
      PORT: "8065"
      SLACK_CHANNELS: "#avesha-agent-alerts"
      NOTIFICATION_INTERVAL: "300"
      SLACK_AFTER_TS_UTC: "2025-07-24 07:00:00"
      SLACK_PARSER_CONFIG: "/home/appuser/config/slack_parser_config.json"
      # Note: SLACK_BOT_TOKEN is provided via globalSecretEnv
      # Note: OTEL_COLLECTOR_ENDPOINT and OTEL_SERVICE_NAME are provided via globalSecretEnv
      # Note: LOG_LEVEL is provided via globalSecretEnv
  volumes:
    config:
      enabled: true
      mountPath: /home/appuser/config
      subPath: ""
      configMap:
        enabled: true
        files:
          slack_parser_config.json: |
            {
              "application": "^(.*) was deployed",
              "syncRevision": "Revision: (.*)$",
              "syncStatus": "in (.*) status",
              "details": "https://localhost:4000/applications/(.*)",
              "repo": "Repo: (.*)$",
              "path": "Path: (.*)$",
              "revision": "^Revision: (.*)$",
              "severity": "Type: (Warning|Error)"
            }


# AWS CloudWatch Alarms Monitoring - monitors and processes AWS CloudWatch alarms
aws-ec2-cloudwatch-alarms:
  enabled: false # OPTIONAL SERVICE - Enable for AWS CloudWatch monitoring (requires AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ROLE_ARN_EC2_CLOUDWATCH_ALARMS)
  fullnameOverride: "aws-ec2-cloudwatch-alarms"
  image:
    repository: *awsEc2CloudwatchAlarmsImage
    tag: "1.1.1"
  service:
    port: 8065
  env:
    app:
      AGENT_TYPE: "aws_ec2_cloudwatch_alarms"
      PORT: "8065"
      CLOUDWATCH_NAMESPACE: "AWS/EC2"
      ALARM_CHECK_INTERVAL: "300"
      INSTANCE_TAG_FILTER: "Environment=production"
      OTEL_SERVICE_NAME: "aws-ec2-cloudwatch-alarms"
      SLO_CONFIG_FILE: "/home/appuser/slo-config.json"
      INVENTORY_SAMPLING_INTERVAL_SECONDS: "120"
      # Note: METRICS_SAMPLING_INTERVAL_SECONDS and LOG_LEVEL are provided via globalSecretEnv
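
After installation, a quick sanity check (the namespace matches global.namespace above, and the release name follows the install examples in the header comments):

kubectl get pods -n avesha
helm status obliq-sre-agent -n avesha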