How to write fast, reliable tests for ADK agents using InMemoryRunner and mocks

Why ADK testing is different

Testing AI agents is harder than testing regular code because LLM outputs are non-deterministic and live API calls are slow and expensive. ADK gives you InMemoryRunner as a lightweight test harness and dependency injection patterns that make mocking straightforward.

InMemoryRunner: the test harness

InMemoryRunner runs your agent entirely in memory without any persistence or external infrastructure. It is the right choice for unit tests.

import asyncio
import pytest
from google.adk.agents import LlmAgent
from google.adk.runners import InMemoryRunner
from google.genai import types
 
# The agent under test
# Minimal single-model agent — no tools, no sub-agents — so the test
# exercises only the runner plumbing and the model's reply.
greeter = LlmAgent(
    name='greeter',
    model='gemini-2.0-flash',
    instruction='Greet the user warmly and ask how you can help.',
)
 
async def run_agent(message: str) -> str:
    """Send one user message to ``greeter`` and return its final text reply.

    A fresh InMemoryRunner and session are created on every call, so
    consecutive tests never share conversation state.
    """
    runner = InMemoryRunner(agent=greeter, app_name='test_app')
    session = await runner.session_service.create_session(
        app_name='test_app', user_id='test_user'
    )
    user_message = types.Content(
        role='user',
        parts=[types.Part(text=message)],
    )
    # Collect text parts from final-response events, then join once.
    pieces = []
    async for event in runner.run_async(
        user_id='test_user',
        session_id=session.id,
        new_message=user_message,
    ):
        if event.is_final_response() and event.content:
            pieces.extend(p.text for p in event.content.parts if p.text)
    return ''.join(pieces)
 
@pytest.mark.asyncio
async def test_greeter_responds():
    """The greeter should produce a non-empty, greeting-flavoured reply."""
    reply = (await run_agent('Hello!')).lower()
    assert reply, 'Expected a non-empty response'
    # Fuzzy assertion — check sentiment rather than exact words, since
    # LLM wording varies between runs.
    greeting_words = ('hello', 'hi', 'welcome', 'greet')
    assert any(word in reply for word in greeting_words)
 
Test for structure and intent, not exact strings. LLM outputs vary between runs. Assert that the response contains expected information, not that it matches a specific phrase.

Mocking tools for deterministic tests

The main strategy for deterministic testing: replace real tool implementations with mocked versions that return predictable outputs.

from google.adk.tools import FunctionTool
from unittest.mock import patch
 
# Real tool (makes HTTP call)
async def fetch_weather(city: str) -> dict:
    """Fetches current weather for a city."""
    # Placeholder body for the article; a real implementation would make
    # an HTTP request and return a dict such as
    # {'temperature_c': ..., 'condition': ..., 'humidity_pct': ...}.
    # NOTE(review): as written, ``pass`` means this returns None, not a
    # dict — fine for a stub that is always mocked in tests.
    pass
 
# Wrap the coroutine as an ADK tool. Note: the function *object* is
# captured here, so rebinding the module attribute ``fetch_weather``
# later (e.g. via unittest.mock.patch) does not change what this tool
# calls — mock at the tool level instead.
weather_tool = FunctionTool(fetch_weather)
 
# Agent under test: one model plus the weather tool.
weather_agent = LlmAgent(
    name='weather_agent',
    model='gemini-2.0-flash',
    instruction='Answer weather questions.',
    tools=[weather_tool],
)
 
@pytest.mark.asyncio
async def test_weather_agent_uses_tool():
    """The agent should surface mocked tool output in its answer.

    Patching ``mymodule.fetch_weather`` would NOT work here: FunctionTool
    captured the original function object when ``weather_tool`` was built,
    so rebinding the module attribute never reaches the tool (and patching
    an ``async def`` with a plain ``return_value`` yields a non-awaitable).
    Instead, construct the agent with a mock tool returning canned data.
    """
    mock_weather = {'temperature_c': 20, 'condition': 'Sunny', 'humidity_pct': 55}

    async def fetch_weather(city: str) -> dict:
        """Fetches current weather for a city."""  # same name/schema as the real tool
        return mock_weather

    mocked_agent = LlmAgent(
        name='weather_agent',
        model='gemini-2.0-flash',
        instruction='Answer weather questions.',
        tools=[FunctionTool(fetch_weather)],
    )
    response = await run_agent_with(mocked_agent, 'What is the weather in Paris?')
    assert '20' in response or 'sunny' in response.lower()
 

Testing tool call events directly

Instead of asserting on the final text response, you can assert on the tool call events — checking that the LLM invoked the right tool with the right arguments.

@pytest.mark.asyncio
async def test_agent_calls_correct_tool():
    """Assert on tool-call events rather than on the final text reply."""
    runner = InMemoryRunner(agent=weather_agent, app_name='test_app')
    session = await runner.session_service.create_session(
        app_name='test_app', user_id='u1'
    )
    prompt = types.Content(role='user', parts=[types.Part(text='Weather in Tokyo?')])
 
    observed = []
    async for event in runner.run_async(
        user_id='u1', session_id=session.id, new_message=prompt
    ):
        observed.extend(event.get_function_calls() or [])
 
    # The model must have invoked the weather tool at least once...
    weather_calls = [call for call in observed if call.name == 'fetch_weather']
    assert weather_calls, 'Agent should have called fetch_weather'
 
    # ...and at least one of those calls must mention Tokyo.
    assert any('tokyo' in str(call.args).lower() for call in weather_calls), \
        'Should have called fetch_weather with Tokyo'
 

Testing multi-agent systems

For multi-agent setups (SequentialAgent, ParallelAgent, or custom orchestrators), test each subagent in isolation first, then test the composite system.

from google.adk.agents import SequentialAgent
 
# Two single-purpose subagents — test each in isolation first.
step1 = LlmAgent(name='step1', model='gemini-2.0-flash',
                 instruction='Extract the main topic from the text.')
step2 = LlmAgent(name='step2', model='gemini-2.0-flash',
                 instruction='Write a one-paragraph summary of the topic.')
 
# Composite system: SequentialAgent runs its sub_agents one after another.
pipeline = SequentialAgent(name='pipeline', sub_agents=[step1, step2])
 
@pytest.mark.asyncio
async def test_pipeline_produces_summary():
    """The full pipeline should end with a paragraph-length summary."""
    output = await run_agent_with(pipeline, 'Article: AI is transforming healthcare...')
    # Fuzzy structural check: approximate "a full paragraph" as >20 words
    # rather than asserting any exact wording.
    word_count = len(output.split())
    assert word_count > 20, 'Summary should be a full paragraph'
 

Pytest fixtures for reuse

import pytest
from google.adk.runners import InMemoryRunner
 
@pytest.fixture
def runner():
    """Provide a fresh InMemoryRunner for each test.

    ``weather_agent`` is a module-level object, not a fixture, so it is
    referenced directly. Declaring it as a fixture parameter (as a naive
    version of this fixture might) makes pytest fail at collection with
    "fixture 'weather_agent' not found".
    """
    return InMemoryRunner(agent=weather_agent, app_name='test_app')
 
@pytest.fixture
async def session(runner):
    # Async fixture: a plain @pytest.fixture on an async def is resolved
    # by pytest-asyncio when asyncio_mode = auto (as recommended later in
    # this article); under strict mode this would need
    # @pytest_asyncio.fixture instead — verify against your pytest.ini.
    return await runner.session_service.create_session(
        app_name='test_app', user_id='test_user'
    )
 
@pytest.mark.asyncio
async def test_with_fixtures(runner, session):
    """Smoke test: the agent emits at least one final-response event."""
    greeting = types.Content(role='user', parts=[types.Part(text='Hi')])
    finals = [
        event
        async for event in runner.run_async(
            user_id='test_user', session_id=session.id, new_message=greeting
        )
        if event.is_final_response()
    ]
    assert len(finals) > 0
 

CI/CD integration

Add these practices when running ADK tests in CI:

  • Set GOOGLE_API_KEY as a CI secret — InMemoryRunner still calls Gemini unless you mock the LLM
  • Use pytest-asyncio with asyncio_mode = auto in pytest.ini for cleaner async tests
  • Mock expensive tools (web search, code execution) in all CI tests
  • Run live integration tests nightly on a separate CI job, not on every PR
  • Use pytest -x --tb=short to stop at first failure and keep CI output readable
# pytest.ini
[pytest]
asyncio_mode = auto
markers =
    integration: marks tests as integration tests (deselect with -m 'not integration')
 
# Run only unit tests in CI
pytest -m 'not integration' tests/
 
# Run integration tests separately
pytest -m integration tests/ --timeout=60
 

What to test and what not to

| Test this | Do not test this |
| --- | --- |
| Agent calls the right tool given a specific prompt | Exact wording of LLM responses |
| Tool functions return correct data given inputs | That the LLM chose a specific phrasing |
| Multi-agent routing sends tasks to correct subagent | Internal LLM reasoning steps |
| Agent handles tool errors gracefully | Token count of a response |
| Output contains required information (fuzzy assert) | Exact JSON structure from a freeform LLM call |