How to Evaluate Assistants#

Download Python Script

Python script/notebook for this guide: Evaluate Assistants how-to script

Prerequisites

This guide assumes familiarity with:

Evaluating the robustness and performance of assistants requires careful, reproducible measurement: benchmarking an assistant on a dataset of tasks and reporting metrics. This is what the AssistantEvaluator is designed for.

The AssistantEvaluator works as follows:

[Figure: overview of the AssistantEvaluator workflow — the evaluator runs tasks against the environment, which supplies the assistant and human proxy, and scorers produce the metrics]

Evaluation is performed by running an AssistantEvaluator over a set of EvaluationTask instances within an EvaluationEnvironment. The environment provides the assistant under test, a human proxy (if needed), and optional lifecycle hooks (init/reset). Metrics are produced by TaskScorer implementations attached to the tasks.

WayFlow supports several LLM API providers. Select an LLM from the options below:

from wayflowcore.models import OCIGenAIModel

if __name__ == "__main__":

    llm = OCIGenAIModel(
        model_id="provider.model-id",
        service_endpoint="https://url-to-service-endpoint.com",
        compartment_id="compartment-id",
        auth_type="API_KEY",
    )
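
Alternatively, the full script at the end of this guide runs against a model served with vLLM; any supported LLM can be swapped in:

from wayflowcore.models import VllmModel

llm = VllmModel(
    model_id="LLAMA_MODEL_ID",  # placeholder: the model identifier served by your vLLM instance
    host_port="LLAMA_API_URL",  # placeholder: host and port of your vLLM endpoint
)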

Basic implementation#

A typical end-to-end evaluation includes:

  1. Defining an evaluation environment that supplies the assistant and (optionally) a human proxy.

  2. Implementing one or more task scorers to compute metrics.

  3. Preparing a set of evaluation tasks (dataset).

  4. Running the evaluator and collecting results.
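
The snippets below rely on the following imports (identical to the full script at the end of this guide):

from typing import Dict

from wayflowcore.agent import Agent
from wayflowcore.conversation import Conversation
from wayflowcore.conversationalcomponent import ConversationalComponent
from wayflowcore.models.llmmodel import LlmModel
from wayflowcore.evaluation import (
    AssistantEvaluator,
    EvaluationEnvironment,
    EvaluationTask,
    TaskScorer,
    HumanProxyAssistant,
)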

Define the evaluation environment:

class MathEnvironment(EvaluationEnvironment):
    def __init__(self, env_id: str, llm: LlmModel):
        self.llm = llm
        self.assistant: ConversationalComponent = None
        self.human_proxy: HumanProxyAssistant = None
        super().__init__(env_id=env_id)

    def get_assistant(self, task: EvaluationTask) -> ConversationalComponent:
        if self.assistant is not None:
            return self.assistant

        self.assistant = Agent(
            llm=self.llm,
            custom_instruction="""The assistant is MathAssistant, tasked with answering math related questions from users.
When asked a question, the assistant should use mathematical reasoning to compute the correct answer. Remember that you have no tool for this job,
so only use your internal computation skills. The output format should be as follows:
Result: [RESULT]""",
        )
        return self.assistant

    def get_human_proxy(self, task: EvaluationTask) -> ConversationalComponent:
        if self.human_proxy is not None:
            return self.human_proxy
        self.human_proxy = HumanProxyAssistant(
            llm=self.llm,
            full_task_description=task.description,
            short_task_description=task.description,
            assistant_role="An helpful math assistant, whose job is to answer math related questions involving simple math reasoning.",
            user_role="A user having a math-related question. He wants the answer to be formatted in the following format:\nResult: [RESULT]",
        )
        return self.human_proxy

    def init_env(self, task: EvaluationTask):
        pass

    def reset_env(self, task: EvaluationTask):
        pass


math_env = MathEnvironment(env_id="math", llm=llm)

Create a task scorer to compute metrics from the assistant conversation:

class MathScorer(TaskScorer):
    OUTPUT_METRICS = ["absolute_error"]
    DEFAULT_SCORER_ID = "math_scorer"

    def score(
        self,
        environment: MathEnvironment,
        task: EvaluationTask,
        assistant: ConversationalComponent,
        assistant_conversation: Conversation,
    ) -> Dict[str, float]:
        last_assistant_message = assistant_conversation.get_last_message().content.lower()
        if "result:" not in last_assistant_message:
            raise ValueError("Incorrect output formatting")
        assistant_answer = last_assistant_message.split("result:")[-1]
        assistant_answer = assistant_answer.split("\n")[0].replace("$", "").strip()
        assistant_answer = float(assistant_answer)
        expected_answer = task.scoring_kwargs["expected_output"]
        error = abs(expected_answer - assistant_answer)
        return {"absolute_error": error}

    def score_exceptional_case(
        self,
        environment: MathEnvironment,
        exception: Exception,
        task: EvaluationTask,
        assistant: ConversationalComponent,
        assistant_conversation: Conversation,
    ) -> Dict[str, float]:
        return {"absolute_error": None}


scorers = [MathScorer(scorer_id="math_scorer1")]
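
As a quick illustration of the output format the scorer expects, here is the same parsing logic applied to a sample reply (plain Python, independent of the evaluation framework):

# Standalone walk-through of the answer parsing performed in MathScorer.score
sample_reply = "Result: $4.0$"
lowered = sample_reply.lower()
assert "result:" in lowered, "Incorrect output formatting"
parsed = lowered.split("result:")[-1].split("\n")[0].replace("$", "").strip()
print(float(parsed))  # 4.0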

Prepare the evaluation configuration (dataset and tasks):

data = [
    {
        "query": "What is the answer to the question: 2+2 = ?",
        "expected_output": 4,
    },
    {
        "query": "What is the answer to the question: 2x2 = ?",
        "expected_output": 4,
    },
    {
        "query": "What is the answer to the question: 2-2 = ?",
        "expected_output": 0,
    },
    {
        "query": "What is the answer to the question: 2/2 = ?",
        "expected_output": 1,
    },
]
tasks = [
    EvaluationTask(
        task_id=f"task_{i}",
        description=question["query"],
        scorers=scorers,
        scoring_kwargs={"expected_output": question["expected_output"]},
    )
    for i, question in enumerate(data)
]

Run the evaluation and inspect the results:

evaluator = AssistantEvaluator(
    environment=math_env,
    max_conversation_rounds=1,
)
results = evaluator.run_benchmark(tasks, N=1)
print(results)
#   task_id  task_attempt_number  absolute_error            conversation
# 0  task_0                    0             0.0   [Message(content='...
# 1  task_1                    0             0.0   [Message(content='...
# 2  task_2                    0             0.0   [Message(content='...
# 3  task_3                    0             0.0   [Message(content='...
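
The printed output suggests that run_benchmark returns a pandas DataFrame with one row per task attempt. Under that assumption, you can aggregate the metric across tasks:

# Assumes `results` is a pandas DataFrame, as the printed output above suggests.
mean_abs_error = results["absolute_error"].mean()
print(f"Mean absolute error over {len(results)} task attempts: {mean_abs_error:.3f}")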

Hint

Task kwargs vs Scoring kwargs

  • Use task kwargs to parameterize task execution (information the assistant needs).

  • Use scoring kwargs to store ground truth and other scoring parameters.
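
For illustration, a single task could carry both kinds of parameters. The task_kwargs argument below is an assumption about the EvaluationTask API (only scoring_kwargs appears in this guide), so check the API reference for the exact parameter name:

# Hypothetical sketch: `task_kwargs` is assumed to mirror `scoring_kwargs`;
# verify the exact argument name in the EvaluationTask API reference.
task = EvaluationTask(
    task_id="task_with_context",
    description="What is the answer to the question: 12% of 50 = ?",
    scorers=scorers,
    task_kwargs={"allowed_rounds": 1},      # information used when running the task
    scoring_kwargs={"expected_output": 6},  # ground truth used only by the scorer
)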

Important

Task scorers must extend TaskScorer and follow its API. See the API docs for details.
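
Based only on the MathScorer example above, a minimal scorer skeleton looks like this (see the API docs for the authoritative signatures):

# Minimal scorer skeleton, modeled on MathScorer above (not an official template).
class MyScorer(TaskScorer):
    OUTPUT_METRICS = ["my_metric"]    # names of the metrics this scorer reports
    DEFAULT_SCORER_ID = "my_scorer"

    def score(self, environment, task, assistant, assistant_conversation) -> Dict[str, float]:
        # Inspect the finished conversation and compute the metric values.
        return {"my_metric": 0.0}

    def score_exceptional_case(
        self, environment, exception, task, assistant, assistant_conversation
    ) -> Dict[str, float]:
        # Called when running the task raised an exception; report a sentinel value.
        return {"my_metric": None}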

Next steps#

Having learned how to evaluate WayFlow Assistants end-to-end, you can proceed to:

Full code#

Click on the card at the top of this page to download the full code for this guide or copy the code below.

# Copyright © 2025 Oracle and/or its affiliates.
#
# This software is under the Universal Permissive License (UPL) 1.0 (LICENSE-UPL or https://oss.oracle.com/licenses/upl) or Apache License
# 2.0 (LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0), at your option.

# %%[markdown]
# WayFlow Code Example - How to Evaluate Assistants
# -------------------------------------------------

# How to use:
# Create a new Python virtual environment and install the latest WayFlow version.
# ```bash
# python -m venv venv-wayflowcore
# source venv-wayflowcore/bin/activate
# pip install --upgrade pip
# pip install "wayflowcore==26.1"
# ```

# You can now run the script
# 1. As a Python file:
# ```bash
# python howto_evaluation.py
# ```
# 2. As a Notebook (in VSCode):
# When viewing the file,
#  - press the keys Ctrl + Enter to run the selected cell
#  - or Shift + Enter to run the selected cell and move to the cell below

# .. imports:
from typing import Dict

from wayflowcore.agent import Agent
from wayflowcore.conversation import Conversation
from wayflowcore.conversationalcomponent import ConversationalComponent
from wayflowcore.models.llmmodel import LlmModel
from wayflowcore.evaluation import (
    AssistantEvaluator,
    EvaluationEnvironment,
    EvaluationTask,
    TaskScorer,
    HumanProxyAssistant,
)


# %%[markdown]
## Define the llm

# %%
from wayflowcore.models import VllmModel

llm = VllmModel(
    model_id="LLAMA_MODEL_ID",
    host_port="LLAMA_API_URL",
)


# %%[markdown]
## Define the environment

# %%
class MathEnvironment(EvaluationEnvironment):
    def __init__(self, env_id: str, llm: LlmModel):
        self.llm = llm
        self.assistant: ConversationalComponent = None
        self.human_proxy: HumanProxyAssistant = None
        super().__init__(env_id=env_id)

    def get_assistant(self, task: EvaluationTask) -> ConversationalComponent:
        if self.assistant is not None:
            return self.assistant

        self.assistant = Agent(
            llm=self.llm,
            custom_instruction="""The assistant is MathAssistant, tasked with answering math related questions from users.
When asked a question, the assistant should use mathematical reasoning to compute the correct answer. Remember that you have no tool for this job,
so only use your internal computation skills. The output format should be as follows:
Result: [RESULT]""",
        )
        return self.assistant

    def get_human_proxy(self, task: EvaluationTask) -> ConversationalComponent:
        if self.human_proxy is not None:
            return self.human_proxy
        self.human_proxy = HumanProxyAssistant(
            llm=self.llm,
            full_task_description=task.description,
            short_task_description=task.description,
            assistant_role="A helpful math assistant, whose job is to answer math related questions involving simple math reasoning.",
            user_role="A user having a math-related question. He wants the answer to be formatted in the following format:\nResult: [RESULT]",
        )
        return self.human_proxy

    def init_env(self, task: EvaluationTask):
        pass

    def reset_env(self, task: EvaluationTask):
        pass


math_env = MathEnvironment(env_id="math", llm=llm)


# %%[markdown]
## Define the scorer

# %%
class MathScorer(TaskScorer):
    OUTPUT_METRICS = ["absolute_error"]
    DEFAULT_SCORER_ID = "math_scorer"

    def score(
        self,
        environment: MathEnvironment,
        task: EvaluationTask,
        assistant: ConversationalComponent,
        assistant_conversation: Conversation,
    ) -> Dict[str, float]:
        last_assistant_message = assistant_conversation.get_last_message().content.lower()
        if "result:" not in last_assistant_message:
            raise ValueError("Incorrect output formatting")
        assistant_answer = last_assistant_message.split("result:")[-1]
        assistant_answer = assistant_answer.split("\n")[0].replace("$", "").strip()
        assistant_answer = float(assistant_answer)
        expected_answer = task.scoring_kwargs["expected_output"]
        error = abs(expected_answer - assistant_answer)
        return {"absolute_error": error}

    def score_exceptional_case(
        self,
        environment: MathEnvironment,
        exception: Exception,
        task: EvaluationTask,
        assistant: ConversationalComponent,
        assistant_conversation: Conversation,
    ) -> Dict[str, float]:
        return {"absolute_error": None}


scorers = [MathScorer(scorer_id="math_scorer1")]


# %%[markdown]
## Define the evaluation config

# %%
data = [
    {
        "query": "What is the answer to the question: 2+2 = ?",
        "expected_output": 4,
    },
    {
        "query": "What is the answer to the question: 2x2 = ?",
        "expected_output": 4,
    },
    {
        "query": "What is the answer to the question: 2-2 = ?",
        "expected_output": 0,
    },
    {
        "query": "What is the answer to the question: 2/2 = ?",
        "expected_output": 1,
    },
]
tasks = [
    EvaluationTask(
        task_id=f"task_{i}",
        description=question["query"],
        scorers=scorers,
        scoring_kwargs={"expected_output": question["expected_output"]},
    )
    for i, question in enumerate(data)
]


# %%[markdown]
## Run the evaluation

# %%
evaluator = AssistantEvaluator(
    environment=math_env,
    max_conversation_rounds=1,
)
results = evaluator.run_benchmark(tasks, N=1)
print(results)
#   task_id  task_attempt_number  absolute_error            conversation
# 0  task_0                    0             0.0   [Message(content='...
# 1  task_1                    0             0.0   [Message(content='...
# 2  task_2                    0             0.0   [Message(content='...
# 3  task_3                    0             0.0   [Message(content='...