import { eval_, runtime } from "@evaris/sdk";
/**
 * Reads a required environment variable, trimming surrounding whitespace.
 *
 * @param name - The environment variable to look up.
 * @returns The trimmed, non-empty value.
 * @throws Error when the variable is unset, empty, or whitespace-only.
 */
function requireEnv(name: string): string {
  const trimmed = process.env[name]?.trim();
  if (trimmed) {
    return trimmed;
  }
  throw new Error(`Missing required env var ${name}`);
}
/**
 * Reads an optional positive-integer environment variable.
 *
 * @param name - The environment variable to look up.
 * @param fallback - Value returned when the variable is unset or blank.
 * @returns The parsed positive integer, or `fallback`.
 * @throws Error when the value is present but is not a positive base-10
 *   integer (e.g. "0", "-3", "2.5", "12abc").
 */
function readPositiveInt(name: string, fallback: number): number {
  const raw = process.env[name]?.trim();
  if (!raw) {
    return fallback;
  }
  // Number(raw) rejects trailing garbage ("12abc" -> NaN), unlike parseInt,
  // and Number.isInteger additionally rejects fractional values ("2.5") and
  // NaN/Infinity, so malformed input fails loudly instead of being truncated.
  const value = Number(raw);
  if (!Number.isInteger(value) || value <= 0) {
    throw new Error(`${name} must be a positive integer`);
  }
  return value;
}
/**
 * Builds and submits a ReAct research-agent eval over HotpotQA, then waits
 * for the runtime to report the run's terminal status and prints a summary.
 *
 * Required env vars: EVARIS_PROJECT_ID, EVARIS_API_KEY.
 * Optional env vars tune service URLs, model names, dataset size, and the
 * agent's web-search / web-browser tooling.
 */
async function main(): Promise<void> {
  // `||` (not `??`) for defaults: a variable that is set but blank trims to
  // "" (which is not nullish), and should still fall back to the default
  // rather than producing an empty base URL or model name.
  const runtimeBaseUrl =
    process.env.EVARIS_RUNTIME_URL?.trim() || "http://127.0.0.1:8100";
  const platformBaseUrl =
    process.env.EVARIS_PLATFORM_URL?.trim() || "http://localhost:3000";
  const projectId = requireEnv("EVARIS_PROJECT_ID");
  const apiKey = requireEnv("EVARIS_API_KEY");
  // Normalize a blank suite id to undefined so the SDK treats it as unset
  // instead of receiving an empty-string suite_id.
  const suiteId = process.env.EVARIS_SUITE_ID?.trim() || undefined;
  const agentModel =
    process.env.EVARIS_MODEL?.trim() || "openrouter/openai/gpt-4o-mini";
  const judgeModel =
    process.env.EVARIS_JUDGE_MODEL?.trim() || "openrouter/openai/gpt-4o-mini";
  const limit = readPositiveInt("EVARIS_HOTPOT_LIMIT", 25);
  const webSearchProvider = process.env.EVARIS_WEB_SEARCH_PROVIDER?.trim();
  // Browser tool is opt-in: only the exact (case-insensitive) string "true"
  // enables it.
  const enableWebBrowser =
    (process.env.EVARIS_ENABLE_WEB_BROWSER?.trim().toLowerCase() ?? "") ===
    "true";

  // Both tools are optional; an empty tool list is a valid configuration.
  const agentTools = [
    ...(webSearchProvider
      ? [eval_.tools.webSearch({ provider: webSearchProvider })]
      : []),
    ...(enableWebBrowser
      ? [eval_.tools.webBrowser({ interactive: false })]
      : []),
  ];

  const client = new runtime.Client({
    baseUrl: runtimeBaseUrl,
    auth: runtime.platformApiKeyAuth({
      platformBaseUrl,
      projectId,
      apiKey,
    }),
  });

  const evalDefinition = eval_.define({
    suite_id: suiteId,
    id: "react-support-research-hotpotqa",
    description:
      "ReAct research agent eval for support and operations escalations using HotpotQA.",
    // Fixed seed + shuffle: a reproducible random subset of the validation
    // split, capped at `limit` samples.
    data: eval_.datasets.huggingface("ParthMandaliya/hotpot_qa", {
      name: "distractor",
      split: "validation",
      limit,
      shuffle: true,
      seed: 17,
      sample_fields: {
        input: "question",
        target: "answer",
      },
    }),
    run: eval_.agentRunner({
      type: "inspect_native",
      mode: "react",
      model: {
        name: agentModel,
      },
      steps: [
        eval_.steps.systemMessage(
          [
            "You are a support escalation research agent.",
            "Investigate the question carefully before answering.",
            "Use tools when you need evidence.",
            "Return a concise final answer with no extra commentary.",
          ].join("\n"),
        ),
        eval_.steps.userMessage(
          [
            "Customer escalation question:",
            "{input}",
            "",
            "Work like a ReAct agent: search, inspect sources, then answer.",
          ].join("\n"),
        ),
        eval_.steps.agent(
          agentTools,
          {
            prompt:
              "Prefer a short search loop. Verify the answer before you submit it.",
            messageLimit: 12,
            maxAttempts: 2,
            truncation: "auto",
          },
        ),
      ],
      config: {
        use_case: "support-escalation-research",
      },
    }),
    // Three complementary scorers: token-overlap F1, normalized exact match,
    // and an LLM judge for semantic correctness.
    score: [
      eval_.scorers.f1(),
      eval_.scorers.match({
        location: "any",
        ignoreCase: true,
        ignoreWhitespace: true,
        ignorePunctuation: true,
      }),
      eval_.scorers.modelGradedQa({
        model: judgeModel,
        partialCredit: true,
        instructions: [
          "Grade the answer on factual correctness.",
          "Reward answers that resolve the question directly.",
          "Penalize unsupported claims and missing key entities.",
        ].join(" "),
      }),
    ],
    channel: "sdk",
    labels: {
      cookbook: "react-agent",
      dataset: "hotpotqa",
      use_case: "support-research",
    },
    params: {
      cookbook: {
        scenario: "react-support-research",
        dataset: "ParthMandaliya/hotpot_qa",
        split: "validation",
      },
    },
  });

  const submitted = await client.submitEval(evalDefinition);
  // Poll every 5s, give up after 30 minutes.
  const run = await client.waitForRun(submitted.job_id, {
    pollIntervalMs: 5_000,
    timeoutMs: 30 * 60 * 1_000,
  });

  console.log(
    JSON.stringify(
      {
        job_id: submitted.job_id,
        suite_id: run.suite_id,
        status: run.status,
      },
      null,
      2,
    ),
  );
}
// Entry point: run the script; any unhandled rejection is logged and maps to
// a non-zero process exit code (without forcing an immediate exit).
main().catch((reason: unknown) => {
  console.error(reason);
  process.exitCode = 1;
});