The EvalSuite class groups multiple EvalTest instances and provides aggregate metrics across all tests.

Import

import { EvalSuite, EvalTest } from "@mcpjam/sdk";

Constructor

new EvalSuite(options: EvalSuiteOptions)

Parameters

| Parameter | Type | Required | Description |
|---|---|---|---|
| options | EvalSuiteOptions | Yes | Configuration for the evaluation suite. |

EvalSuiteOptions

| Property | Type | Required | Description |
|---|---|---|---|
| name | string | Yes | Name for the suite |

Example

const suite = new EvalSuite({ name: "Math Operations" });

Methods

add()

Adds a test to the suite.
add(test: EvalTest): void

Parameters

| Parameter | Type | Description |
|---|---|---|
| test | EvalTest | The test to add |

Example

suite.add(new EvalTest({
  name: "addition",
  test: async (agent) => {
    const r = await agent.prompt("Add 2 and 3");
    return r.hasToolCall("add");
  },
}));

suite.add(new EvalTest({
  name: "multiplication",
  test: async (agent) => {
    const r = await agent.prompt("Multiply 4 by 5");
    return r.hasToolCall("multiply");
  },
}));
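
When a suite contains many similar checks, the tests can also be generated from a small data table instead of being added one by one. The sketch below relies only on add() and the EvalTest options shown above; the cases array and its field names are illustrative, not part of the SDK.

const cases = [
  { name: "addition", prompt: "Add 2 and 3", tool: "add" },
  { name: "multiplication", prompt: "Multiply 4 by 5", tool: "multiply" },
];

for (const { name, prompt, tool } of cases) {
  suite.add(new EvalTest({
    name,
    // Each case passes when the agent calls the expected tool for its prompt.
    test: async (agent) => (await agent.prompt(prompt)).hasToolCall(tool),
  }));
}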

run()

Runs all tests in the suite.
run(agent: TestAgent, options: RunOptions): Promise<void>

Parameters

| Parameter | Type | Description |
|---|---|---|
| agent | TestAgent | The agent to test with |
| options | RunOptions | Run configuration (same as EvalTest) |

RunOptions

| Property | Type | Required | Default | Description |
|---|---|---|---|---|
| iterations | number | Yes | - | Number of runs per test |
| concurrency | number | No | 1 | Parallel runs per test |
| retries | number | No | 0 | Retry failed tests |
| timeoutMs | number | No | 60000 | Timeout per test (ms) |
| onProgress | ProgressCallback | No | - | Progress callback |
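
ProgressCallback is not expanded on this page; judging from the onProgress usage in the examples below, it is assumed to receive the number of completed iterations and the total.

// Assumed shape of ProgressCallback, inferred from the onProgress examples on this page.
type ProgressCallback = (completed: number, total: number) => void;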

Example

await suite.run(agent, {
  iterations: 30,
  concurrency: 5,
});
Tests within the suite run sequentially, but each test’s iterations can run concurrently based on the concurrency setting.
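
A fuller run configuration that exercises the remaining options from the table above might look like the following sketch; the retry and timeout values are arbitrary, and the progress callback mirrors the one used in the complete example further down.

await suite.run(agent, {
  iterations: 30,
  concurrency: 5,
  retries: 1,        // retry failed runs once (retries defaults to 0)
  timeoutMs: 30000,  // per-test timeout of 30 seconds instead of the 60s default
  onProgress: (done, total) => {
    process.stdout.write(`\r  Progress: ${done}/${total}`);
  },
});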

accuracy()

Returns the aggregate accuracy across all tests.
accuracy(): number

Returns

number - Average accuracy of all tests (0.0 - 1.0).

Example

console.log(`Suite accuracy: ${(suite.accuracy() * 100).toFixed(1)}%`);

get()

Retrieves a specific test by name.
get(name: string): EvalTest | undefined

Parameters

| Parameter | Type | Description |
|---|---|---|
| name | string | The test name |

Returns

EvalTest | undefined - The test, or undefined if not found.

Example

const addTest = suite.get("addition");
if (addTest) {
  console.log(`Addition: ${(addTest.accuracy() * 100).toFixed(1)}%`);
}

getTests()

Returns all tests in the suite.
getTests(): EvalTest[]

Returns

EvalTest[] - Array of all tests.

Example

for (const test of suite.getTests()) {
  console.log(`${test.name}: ${(test.accuracy() * 100).toFixed(1)}%`);
}
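
Because getTests() returns a plain array, standard array operations apply. For example, here is a small sketch that surfaces the weakest test in the suite, using only the name property and accuracy() documented on this page:

const worst = suite
  .getTests()
  .reduce((a, b) => (a.accuracy() <= b.accuracy() ? a : b));

console.log(`Weakest test: ${worst.name} at ${(worst.accuracy() * 100).toFixed(1)}%`);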

Properties

name

The suite’s name.
suite.name // "Math Operations"

Complete Example

import { MCPClientManager, TestAgent, EvalSuite, EvalTest } from "@mcpjam/sdk";

async function main() {
  // Setup
  const manager = new MCPClientManager({
    everything: {
      command: "npx",
      args: ["-y", "@modelcontextprotocol/server-everything"],
    },
  });
  await manager.connectToServer("everything");

  const agent = new TestAgent({
    tools: await manager.getTools(),
    model: "anthropic/claude-sonnet-4-20250514",
    apiKey: process.env.ANTHROPIC_API_KEY,
    temperature: 0.1,
  });

  // Build suite
  const suite = new EvalSuite({ name: "Everything Server Tests" });

  suite.add(new EvalTest({
    name: "add",
    test: async (a) => (await a.prompt("Add 2+3")).hasToolCall("add"),
  }));

  suite.add(new EvalTest({
    name: "echo",
    test: async (a) => (await a.prompt("Echo 'test'")).hasToolCall("echo"),
  }));

  suite.add(new EvalTest({
    name: "longRunningOperation",
    test: async (a) => (await a.prompt("Run a long operation")).hasToolCall("longRunningOperation"),
  }));

  // Run
  console.log(`Running ${suite.name}...\n`);

  await suite.run(agent, {
    iterations: 20,
    concurrency: 3,
    onProgress: (done, total) => {
      process.stdout.write(`\r  Progress: ${done}/${total}`);
    },
  });

  // Report
  console.log("\n\n📊 Results");
  console.log("═".repeat(40));
  console.log(`Overall: ${(suite.accuracy() * 100).toFixed(1)}%\n`);

  console.log("Per-test breakdown:");
  for (const test of suite.getTests()) {
    const pct = (test.accuracy() * 100).toFixed(1);
    const icon = test.accuracy() >= 0.9 ? "✅" : test.accuracy() >= 0.7 ? "⚠️" : "❌";
    console.log(`  ${icon} ${test.name}: ${pct}%`);
  }

  // Access individual test
  const echoTest = suite.get("echo");
  if (echoTest) {
    console.log(`\nEcho test details:`);
    console.log(`  Precision: ${(echoTest.precision() * 100).toFixed(1)}%`);
    console.log(`  Recall: ${(echoTest.recall() * 100).toFixed(1)}%`);
    console.log(`  Avg tokens: ${echoTest.averageTokenUse()}`);
  }

  // Cleanup
  await manager.disconnectServer("everything");
}

main().catch(console.error);

Patterns

CI Quality Gate

await suite.run(agent, { iterations: 30 });

if (suite.accuracy() < 0.90) {
  console.error(`❌ Suite accuracy ${(suite.accuracy() * 100).toFixed(1)}% below 90% threshold`);
  process.exit(1);
}

console.log("✅ All quality gates passed");

Per-Test Thresholds

await suite.run(agent, { iterations: 30 });

const criticalTests = ["createOrder", "processPayment"];
let failed = false;

for (const name of criticalTests) {
  const test = suite.get(name);
  if (test && test.accuracy() < 0.95) {
    console.error(`❌ Critical test "${name}" below 95%`);
    failed = true;
  }
}

if (failed) process.exit(1);

Comparing Across Providers

const providers = [
  { model: "anthropic/claude-sonnet-4-20250514", key: "ANTHROPIC_API_KEY" },
  { model: "openai/gpt-4o", key: "OPENAI_API_KEY" },
];

for (const { model, key } of providers) {
  const agent = new TestAgent({
    tools,
    model,
    apiKey: process.env[key],
  });

  await suite.run(agent, { iterations: 20 });
  console.log(`${model}: ${(suite.accuracy() * 100).toFixed(1)}%`);
}
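
To keep the per-provider numbers around for a side-by-side summary, the result can be captured after each run. This sketch makes the same assumption as the loop above: reading accuracy() after each run() reflects that provider's run.

const results: Record<string, number> = {};

for (const { model, key } of providers) {
  const agent = new TestAgent({ tools, model, apiKey: process.env[key] });
  await suite.run(agent, { iterations: 20 });
  results[model] = suite.accuracy();
}

// Print a comparison table of overall accuracy per model.
console.table(
  Object.entries(results).map(([model, accuracy]) => ({
    model,
    accuracy: `${(accuracy * 100).toFixed(1)}%`,
  })),
);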