Version: 1.11.0

Inference Endpoints

An Inference Endpoint is a hosted service that performs inference tasks, such as making predictions or generating outputs, using a pre-trained AI model.

Create an Inference Endpoint

Use this API to create an Inference Endpoint.

Method and URL

POST /api/v1/inference-endpoint

Parameters

| Parameter | Parameter Type | Description | Required |
|---|---|---|---|
| clusterName | String | The name of the cluster. | Mandatory |
| endpointName | String | The name of the endpoint. | Mandatory |
| workspace | String | The name of the slice workspace. | Mandatory |
| modelSpec | Object | The AI model details. | Mandatory |
| gpuSpec | Object | The GPU specifications. For CPU-based inference, specify the value as None. | Mandatory |

Model Spec Parameters

| Parameter | Parameter Type | Description | Required |
|---|---|---|---|
| modelName | String | The AI model name. | Mandatory |
| storageURI | String | The URI (Uniform Resource Identifier) of the model object, exposed via an HTTP or HTTPS endpoint. | Mandatory |

GPU Spec Parameters

| Parameter | Parameter Type | Description | Required |
|---|---|---|---|
| gpuShape | String | The name of the GPU type, which you can get from the Inventory details. | Mandatory |
| instanceType | String | The type of instance requested. | Mandatory |
| memoryPerGPU | Integer | The memory requirement in GB per GPU. | Mandatory |
| numberOfGPUs | Integer | The number of GPUs requested. | Mandatory |
| numberOfGPUNodes | Integer | The number of GPU nodes requested. | Mandatory |

Example Request

{
  "clusterName": "{{clusterName}}",
  "endpointName": "{{endpointName}}",
  "workspace": "{{workspaceName}}",
  "modelSpec": {
    "modelName": "sklearn",
    "storageURI": "gs://kfserving-examples/models/sklearn/1.0/model"
  }
}

For GPU-based inference, add the GPU specifications to the request in the gpuSpec object:

 "gpuSpec": {
"gpuShape": "hello",
"instanceType": "world",
"memoryPerGPU": 1024,
"numberOfGPUNodes": 1,
"numberOfGPUs": 1
},
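
For illustration, here is a minimal Python sketch that sends this create request with the requests library. The controller base URL and bearer token are assumptions (they are not defined by this API reference), and the concrete field values simply reuse the examples above; substitute your own cluster, workspace, and GPU details.

import requests

# Assumed values for illustration only; replace with your controller host and credentials.
BASE_URL = "https://<controller-host>"
HEADERS = {"Authorization": "Bearer <token>", "Content-Type": "application/json"}

payload = {
    "clusterName": "worker-1",
    "endpointName": "sklearn-iris",
    "workspace": "team-beta",
    "modelSpec": {
        "modelName": "sklearn",
        "storageURI": "gs://kfserving-examples/models/sklearn/1.0/model",
    },
    # gpuSpec is only needed for GPU-based inference; for CPU-based inference it is None.
    "gpuSpec": {
        "gpuShape": "hello",
        "instanceType": "world",
        "memoryPerGPU": 1024,
        "numberOfGPUNodes": 1,
        "numberOfGPUs": 1,
    },
}

response = requests.post(f"{BASE_URL}/api/v1/inference-endpoint", json=payload, headers=HEADERS)
response.raise_for_status()
print(response.json()["data"]["endpointName"])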

Example Responses

Response: Success

{
  "statusCode": 200,
  "status": "OK",
  "message": "Success",
  "data": {
    "endpointName": "deepankar-hf-llm3"
  }
}

Response: Bad Request

{
  "statusCode": 422,
  "status": "UNKNOWN",
  "message": "UNKNOWN",
  "data": null,
  "error": {
    "errorKey": "UNKNOWN",
    "message": "UNKNOWN",
    "data": "{\"error\":{\"Message\":\"Error while fetching GPR wait time\",\"Status\":422,\"DetailedError\":{\"Errormessage\":\"\\\"exitDuration\\\" is not allowed to be empty\",\"Reason\":\"\",\"Details\":\"\"},\"StatusCode\":422,\"ReferenceDocumentation\":\"NA\"}}"
  }
}

Create an Inference Endpoint with Raw Specs

Use this API to create an Inference Endpoint with raw specs.

Method and URL

POST /api/v1/inference-endpoint

Parameter Description

| Parameter | Parameter Type | Description | Required |
|---|---|---|---|
| clusterName | String | The name of the cluster. | Mandatory |
| endpointName | String | The name of the endpoint. | Mandatory |
| gpuSpec | Object | The GPU specifications. For CPU-based inference, specify the value as None. | Mandatory |
| workspace | String | The name of the slice workspace. | Mandatory |
| rawModelSpec | String | The custom model specifications. | Mandatory |

Example Request

{
  "clusterName": "worker-1",
  "endpointName": "sklearn-iris",
  "gpuSpec": {
    "gpuShape": "hello",
    "instanceType": "world",
    "memoryPerGPU": 1024,
    "numberOfGPUNodes": 1,
    "numberOfGPUs": 1
  },
  "workspace": "team-beta",
  "rawModelSpec": "apiVersion: \"serving.kserve.io/v1beta1\"\nkind: \"InferenceService\"\nmetadata:\n name: \"sklearn-iris\"\nspec:\n predictor:\n model:\n modelFormat:\n name: sklearn\n storageUri: \"gs://kfserving-examples/models/sklearn/1.0/model\"\n"
}
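
A Python sketch of the same request is shown below. Keeping the raw KServe InferenceService spec as a multi-line YAML string avoids hand-escaping the newlines; the base URL and token are assumptions, as in the earlier sketch.

import requests

BASE_URL = "https://<controller-host>"  # assumption: replace with your API host
HEADERS = {"Authorization": "Bearer <token>"}  # assumption: replace with your auth scheme

# Raw KServe InferenceService spec, passed through as a single YAML string.
raw_model_spec = """\
apiVersion: "serving.kserve.io/v1beta1"
kind: "InferenceService"
metadata:
  name: "sklearn-iris"
spec:
  predictor:
    model:
      modelFormat:
        name: sklearn
      storageUri: "gs://kfserving-examples/models/sklearn/1.0/model"
"""

payload = {
    "clusterName": "worker-1",
    "endpointName": "sklearn-iris",
    "workspace": "team-beta",
    "rawModelSpec": raw_model_spec,
    "gpuSpec": {
        "gpuShape": "hello",
        "instanceType": "world",
        "memoryPerGPU": 1024,
        "numberOfGPUNodes": 1,
        "numberOfGPUs": 1,
    },
}

response = requests.post(f"{BASE_URL}/api/v1/inference-endpoint", json=payload, headers=HEADERS)
print(response.status_code, response.json())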

Example Responses

Response: Success

{
  "statusCode": 200,
  "status": "OK",
  "message": "Success",
  "data": {
    "endpointName": "deepankar-hf-llm3"
  }
}

Response: Bad Request

{
  "statusCode": 422,
  "status": "UNKNOWN",
  "message": "UNKNOWN",
  "data": null,
  "error": {
    "errorKey": "UNKNOWN",
    "message": "UNKNOWN",
    "data": "{\"error\":{\"Message\":\"Error while fetching GPR wait time\",\"Status\":422,\"DetailedError\":{\"Errormessage\":\"\\\"exitDuration\\\" is not allowed to be empty\",\"Reason\":\"\",\"Details\":\"\"},\"StatusCode\":422,\"ReferenceDocumentation\":\"NA\"}}"
  }
}

List Inference Endpoints

Use this API to list the Inference Endpoints in a workspace.

Method and URL

GET /api/v1/inference-endpoint/list?workspace

Example Request

GET /api/v1/inference-endpoint/list?workspace=team-omega
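
As a sketch, the same call from Python (the base URL and token are assumptions, as in the earlier sketches):

import requests

BASE_URL = "https://<controller-host>"  # assumption
HEADERS = {"Authorization": "Bearer <token>"}  # assumption

response = requests.get(
    f"{BASE_URL}/api/v1/inference-endpoint/list",
    params={"workspace": "team-omega"},
    headers=HEADERS,
)
# Print a one-line summary per endpoint from the "data.endpoints" array.
for endpoint in response.json()["data"]["endpoints"]:
    print(endpoint["endpointName"], endpoint["status"], endpoint["endpoint"])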

Example Responses

Response: Success

{
  "statusCode": 200,
  "status": "OK",
  "message": "Success",
  "data": {
    "endpoints": [
      {
        "endpointName": "deepankar-hf-llm3",
        "modelName": "huggingface",
        "status": "Running",
        "endpoint": "https://deepankar-hf-llm3.team-omega.aveshalabs.io/v1/model/huggingface-llama3:predict",
        "clusterName": "worker-1",
        "namespace": "team-omega-deepankar-hf-ah1cex"
      },
      {
        "endpointName": "eric-gpt4",
        "modelName": "gpt4-o-preview",
        "status": "Running",
        "endpoint": "https://eric-gpt4.team-omega.aveshalabs.io/v1/model/gpt4-o-preview:predict",
        "clusterName": "worker-1",
        "namespace": "team-omega-eric-gpt4-p8j7fa"
      },
      {
        "endpointName": "eric-gpt4-2",
        "modelName": "gpt4-o-preview",
        "status": "ImagePullBackOff",
        "endpoint": "",
        "clusterName": "worker-1",
        "namespace": "team-omega-eric-gpt4-2-y5n6ka"
      },
      {
        "endpointName": "mini-gpt4",
        "modelName": "gpt4-mini",
        "status": "Pending",
        "endpoint": "",
        "clusterName": "worker-2",
        "namespace": "team-omega-mini-gpt4-l1uh5g"
      }
    ]
  }
}

Describe an Inference Endpoint

Use this API to describe an Inference Endpoint.

Method and URL

GET /api/v1/inference-endpoint?workspace&endpoint

Example Request

GET /api/v1/inference-endpoint?workspace=team-omega&endpoint=deepankar-hf-llm3
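
A Python sketch of the same call (the base URL and token are assumptions, as in the earlier sketches):

import requests

BASE_URL = "https://<controller-host>"  # assumption
HEADERS = {"Authorization": "Bearer <token>"}  # assumption

response = requests.get(
    f"{BASE_URL}/api/v1/inference-endpoint",
    params={"workspace": "team-omega", "endpoint": "deepankar-hf-llm3"},
    headers=HEADERS,
)
# The describe response nests the endpoint details under "data.endpoint".
details = response.json()["data"]["endpoint"]
print(details["status"], details["predictStatus"], details["ingressStatus"])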

Example Responses

Response: Success

{
  "statusCode": 200,
  "status": "OK",
  "message": "Success",
  "data": {
    "endpoint": {
      "endpointName": "deepankar-hf-llm3",
      "modelName": "huggingface",
      "status": "Running",
      "endpoint": "https://deepankar-hf-llm3.team-omega.aveshalabs.io/v1/model/huggingface-llama3:predict",
      "clusterName": "worker-1",
      "namespace": "team-omega-deepankar-hf-ah1cex",
      "predictStatus": "Ready",
      "ingressStatus": "Ready",
      "tryCommand": [
        "curl -v \\",
        "-H \"Host: deepankar-hf-llm3.team-omega.example.com\" \\",
        "-H \"Content-Type: application/json\" \\",
        "\"http://192.168.1.130/v1/models/sklearn-iris:predict\" \\",
        "-d '{...<request body>...}'"
      ],
      "dnsRecords": [
        {
          "dns": "deepankar-hf-llm3.team-omega.example.com",
          "type": "A",
          "value": "192.168.1.130"
        },
        {
          "dns": "deepankar-hf-llm3.team-omega.dev-example.com",
          "type": "A",
          "value": "192.168.1.130"
        }
      ],
      "gpuRequests": [
        {
          "gprName": "deepankar-hf-llm3-gpr-gh6aj7",
          "gprId": "yyyyyyy",
          "instanceType": "VM.GPU.A10.2",
          "gpuShape": "nVidia A-10",
          "numberOfGPUs": 1,
          "numberOfGPUNodes": 1,
          "memoryPerGPU": "24576",
          "status": "Provisioned"
        },
        {
          "gprName": "deepankar-hf-llm3-gpr-lo9ju7",
          "gprId": "xxxxxxxxx",
          "instanceType": "VM.GPU.A10.2",
          "gpuShape": "nVidia A-10",
          "numberOfGPUs": 1,
          "numberOfGPUNodes": 1,
          "memoryPerGPU": "24576",
          "status": "Released"
        }
      ]
    }
  }
}
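
The tryCommand field above shows how to invoke the deployed model through its ingress. A rough Python equivalent of that curl invocation is sketched below; the host, IP address, and path come from the example response, and the request body is left as a placeholder because it depends on the model being served.

import requests

# Rough Python equivalent of the "tryCommand" curl invocation above.
# Fill in the request body expected by the deployed model (elided in the example).
request_body = {}

response = requests.post(
    "http://192.168.1.130/v1/models/sklearn-iris:predict",
    headers={
        "Host": "deepankar-hf-llm3.team-omega.example.com",
        "Content-Type": "application/json",
    },
    json=request_body,
)
print(response.status_code, response.text)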

Delete an Inference Endpoint

Use this API to delete an Inference Endpoint.

Method and URL

DELETE /api/v1/inference-endpoint

Example Request

DELETE /api/v1/inference-endpoint

{
  "endpoint": "{{inferenceEndpointName}}",
  "workspace": "team-omega"
}
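
A Python sketch of the delete call; note that the endpoint name and workspace are passed in the request body (the base URL and token are assumptions, as in the earlier sketches):

import requests

BASE_URL = "https://<controller-host>"  # assumption
HEADERS = {"Authorization": "Bearer <token>"}  # assumption

response = requests.delete(
    f"{BASE_URL}/api/v1/inference-endpoint",
    json={"endpoint": "sklearn-iris", "workspace": "team-omega"},
    headers=HEADERS,
)
print(response.status_code, response.json())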

Example Responses

Response: Success

{
  "statusCode": 200,
  "status": "OK",
  "message": "Success",
  "data": {}
}

Response: Cannot Delete

{
  "statusCode": 500,
  "status": "UNKNOWN",
  "message": "UNKNOWN",
  "data": null,
  "error": {
    "errorKey": "UNKNOWN",
    "message": "UNKNOWN",
    "data": "{\"error\":{\"Message\":\"Error while deleting GPR\",\"Status\":500,\"DetailedError\":{\"Errormessage\":\"Cannot delete GPR in Successful state\",\"Reason\":\"\",\"Details\":\"\"},\"StatusCode\":500,\"ReferenceDocumentation\":\"NA\"}}"
  }
}